From 918f14a08233946b42d199932223b3c203190677 Mon Sep 17 00:00:00 2001 From: UKMO-lsampson Date: Thu, 23 Mar 2023 14:01:02 +0000 Subject: [PATCH 01/15] Initial preparations for GPU ported SMC grid configuration. --- model/bin/switch_UKMO_GPU | 1 + model/src/w3gdatmd.F90 | 32 ++++++++ model/src/w3idatmd.F90 | 3 + model/src/w3initmd.F90 | 75 +++++++++++++++++ model/src/w3psmcmd.F90 | 169 +++++++++++++++++++++++++++++++++++++- model/src/w3wavemd.F90 | 12 +++ 6 files changed, 288 insertions(+), 4 deletions(-) create mode 100644 model/bin/switch_UKMO_GPU diff --git a/model/bin/switch_UKMO_GPU b/model/bin/switch_UKMO_GPU new file mode 100644 index 0000000000..505673b159 --- /dev/null +++ b/model/bin/switch_UKMO_GPU @@ -0,0 +1 @@ +F90 SHRD PR3 UNO FLX0 LN1 ST4 NL1 BT1 IC0 IS0 REF0 DB1 TR0 BS0 XX0 WNT1 WNX1 CRT1 CRX1 RWND NOGRB NC4 GPU diff --git a/model/src/w3gdatmd.F90 b/model/src/w3gdatmd.F90 index 95a6726d28..298f780c7a 100644 --- a/model/src/w3gdatmd.F90 +++ b/model/src/w3gdatmd.F90 @@ -1399,6 +1399,38 @@ MODULE W3GDATMD REAL, POINTER :: IS1C1, IS1C2 #endif !/ +#ifdef W3_GPU +! For CALC_USTART routine: +!$ACC DECLARE CREATE(ZZWND, AALPHA) + +! 
Make some source large lookup tables resident on GPU: +!$ACC DECLARE COPYIN (SSDSBM, SSDSC, DCKI, & +!$ACC SATWEIGHTS, SATINDICES, CUMULW, & +!$ACC QBI, IKTAB) + +!$ACC DECLARE COPYIN( & +!$ACC FTE, FTF, SSTXFTFTAIL, SSTXFTWN, WWNMEANP, WWNMEANPTAIL) + + +!/LS For ROUTINE SEQ SDB1 +!/CB Added sig, dden, nth, nspec +!$ACC DECLARE COPYIN(KDMN, SDBC1, SDBC2, FDONLY, FSSOURCE, & +!$ACC SIG, DDEN, NTH, NK, NSPEC) + +!/LS For ROUTINE SEQ SBT1 +!$ACC DECLARE COPYIN(MAPWN(:), SBTC1) + +!/LS For ROUTINE SEQ LIU_FORWARD_DISPERSION +!$ACC DECLARE COPYIN(IICEHDISP, IICEDDISP, IICEFDISP, IICEHMIN, IICEDISP) + +!/LS W3PSMC +!$ACC DECLARE COPYIN(IJKCel, IJKUFc, IJKVFc, NCel, NUFc, NVFc )& +!$ACC COPYIN(IJKCel3, IJKCel4, IJKVFc5, IJKVFc6, CLATS )& +!$ACC COPYIN(IJKUFc5,IJKUFc6, NSEA, MRFct, ARCTC, CLATF) + +!/LS POST PORT OPTIMISATION +!$ACC DECLARE COPYIN(TRNY, TRNX) +#endif CONTAINS !/ ------------------------------------------------------------------- / diff --git a/model/src/w3idatmd.F90 b/model/src/w3idatmd.F90 index 5a63588912..d4a476b2bd 100644 --- a/model/src/w3idatmd.F90 +++ b/model/src/w3idatmd.F90 @@ -265,6 +265,9 @@ MODULE W3IDATMD #ifdef W3_TIDE LOGICAL, POINTER :: FLLEVTIDE, FLCURTIDE, & FLLEVRESI, FLCURRESI +#endif +#ifdef W3_GPU +!$ACC DECLARE COPYIN(INFLAGS2, ICEP2) #endif !/ CONTAINS diff --git a/model/src/w3initmd.F90 b/model/src/w3initmd.F90 index f2a794323f..ce12b08c96 100644 --- a/model/src/w3initmd.F90 +++ b/model/src/w3initmd.F90 @@ -1480,6 +1480,81 @@ SUBROUTINE W3INIT ( IMOD, IsMulti, FEXT, MDS, MTRACE, ODAT, FLGRD, FLGR2, FLGD, #endif ! ! 8. 
Final MPI set up ----------------------------------------------- / +#ifdef W3_GPU +!/LS From INSNL1 +!$ACC ENTER DATA COPYIN(DAL1, DAL2, DAL3, NSPECX, NSPECY, NFR )& +!$ACC COPYIN(NFRHGH, NFRCHG, AWG1, AWG2, AWG3, AWG4, AWG5 )& +!$ACC COPYIN(AWG6, AWG7, AWG8, SWG1, SWG2, SWG3, SWG4 )& +!$ACC COPYIN(SWG5, SWG6, SWG7, SWG8 )& +!$ACC COPYIN(ICE(:), RHOAIR(:), ASF(:), BERG(:), FPIS(:) )& +!$ACC COPYIN(ICEH(:), ICEF(:), VA(:,:), ICEDMAX(:), UST(:) )& +!$ACC COPYIN(MAPAXY(:), MAPY2(:), MAPX2(:), INFLAGS2(:) )& +!$ACC COPYIN(ATRNY(:,:), ATRNX(:,:), USTDIR(:), FLOGRD(:,:))& +!$ACC COPYIN(CG(:,:), WN(:,:), ALPHA(:,:), MAPSTA(:,:) )& +!$ACC COPYIN(IP11(:), IP12(:), IP13(:), IP14(:) )& +!$ACC COPYIN(IM11(:), IM12(:), IM13(:), IM14(:) )& +!$ACC COPYIN(IP21(:), IP22(:), IP23(:), IP24(:) )& +!$ACC COPYIN(IM21(:), IM22(:), IM23(:), IM24(:) )& +!$ACC COPYIN(IC11(:), IC12(:), IC21(:), IC22(:) )& +!$ACC COPYIN(IC31(:), IC32(:), IC41(:), IC42(:) )& +!$ACC COPYIN(IC51(:), IC52(:), IC61(:), IC62(:) )& +!$ACC COPYIN(IC71(:), IC72(:), IC81(:), IC82(:), AF11(:) ) + +!/LS For ROUTINE SEQ SNL1 +!$ACC ENTER DATA COPYIN(FACHFE, KDCON, KDMN, SNLC1, SNLS1, SNLS2, SNLS3) + +!$ACC UPDATE DEVICE(DELTAUW, DELU, TAUHFT, TAUHFT2, TAUT) + +!! For SDS4 +!$ACC ENTER DATA COPYIN( & +!$ACC SSDSBM, SSDSC, DCKI, & +!$ACC SATWEIGHTS, SATINDICES, CUMULW, & +!$ACC QBI, IKTAB) + +!! FOR SIN4 +!$ACC ENTER DATA COPYIN( & +!$ACC TTAUWSHELTER, SSINTHP, ZZ0RAT, ZZALP, BBETA, SSWELLF, & +!$ACC FWTABLE, DTH, SIG2, DDEN2) + +!! 
For SPR4 +!$ACC ENTER DATA COPYIN( & +!$ACC ZZWND, AALPHA, & +!$ACC FTE, FTF, SSTXFTFTAIL, SSTXFTWN, WWNMEANP, WWNMEANPTAIL) + + +!/LS For ROUTINE SEQ SDB1 +!$ACC ENTER DATA COPYIN(SDBC1, SDBC2, FDONLY, FSSOURCE, & +!$ACC SIG, DDEN, NTH, NK, NSPEC) +!/LS For ROUTINE SEQ SBT1 +!$ACC ENTER DATA COPYIN(MAPWN(:), SBTC1) + + +!/LS Difficult variables +!$ACC ENTER DATA COPYIN(MAPSF(:,:), FLAGST(:), CLATS(:) )& +!$ACC COPYIN(MAPFS(:,:), ISBPI(:), BBPIN(:,:), BBPI0(:,:) )& +!$ACC COPYIN(TBPIN(:), TBPI0(:), GSQRT(:,:) )& +!$ACC COPYIN(TIME(:), DPDY(:,:), DPDX(:,:), DQDX(:,:) )& +!$ACC COPYIN(DQDY(:,:), NSEA, NBI) + +!/LS Updates, already present on device. +!$ACC UPDATE DEVICE(DELAB, DIKCUMUL, DELU, DELTAUW, DELUST, DELALP, & +!$ACC DELTAIL, FWTABLE(:), TAUT(:,:), TAUHFT(:,:), & +!$ACC TAUHFT2(:,:,:)) + +!/LS SMC Grid +!$ACC ENTER DATA COPYIN(IJKCel, IJKUFc, IJKVFc, NCel, NUFc, NVFc, FVERG)& +!$ACC COPYIN(IJKCel3, IJKCel4, IJKVFc5, IJKVFc6, CLATS, ANGARC)& +!$ACC COPYIN(IJKUFc5,IJKUFc6, MRFct, ARCTC, CLATF, CY, CX)& +!$ACC COPYIN(FUNO3, CTRNX, CTRNY, NLvUFc, NLvVFc, NRLv, FLCUR)& +!$ACC COPYIN(DTCFL, DTMS, SX, SY, NGLO, NLvCel) + +!/LS POST PORT OPTIMISATIONS +!$ACC ENTER DATA COPYIN(WHITECAP, BEDFORMS, TAUBBL, TAUICE)& +!$ACC COPYIN(WNMEAN, TAUOCX, TAUOCY, PHICE, PHIBBL)& +!$ACC COPYIN(PHIOC, TWS, CHARN, PHIAW, TAUWNX, TAUWNY, TAUWIX )& +!$ACC COPYIN(TAUWIY, TAUOX, TAUOY, FCUT, DTDYN, TRNX, TRNY)& +!$ACC COPYIN(AS, DW) +#endif ! 
#ifdef W3_MPI CALL W3MPII ( IMOD ) diff --git a/model/src/w3psmcmd.F90 b/model/src/w3psmcmd.F90 index 6083a78223..4e74276413 100644 --- a/model/src/w3psmcmd.F90 +++ b/model/src/w3psmcmd.F90 @@ -133,7 +133,13 @@ MODULE W3PSMCMD !> @author Jian-Guo Li !> @date 18 Apr 2018 !> +#ifdef W3_GPU + SUBROUTINE W3PSMC (ISP,DTG,VQ,FCNt,AFCN,BCNt,UCFL,VCFL,CQ,CQA, & + ULCFLX,VLCFLY,FUMD,FUDIFX,FVMD,FVDIFY, CXTOT, & + CYTOT, AUN, AVN) +#else SUBROUTINE W3PSMC ( ISP, DTG, VQ ) +#endif !/ !/ +------------------------------------+ !/ | Spherical Multiple-Cell (SMC) grid | @@ -296,11 +302,17 @@ SUBROUTINE W3PSMC ( ISP, DTG, VQ ) LOGICAL :: YFIRST !/ !/ Automatic work arrays - ! +#ifdef W3_GPU + REAL, DIMENSION(:), INTENT(INOUT) :: FCNt, AFCN, BCNt, UCFL, VCFL, CQ, & + CQA, CXTOT, CYTOT, AUN, AVN + REAL, DIMENSION(:), INTENT(INOUT) :: FUMD, FUDIFX, ULCFLX + REAL, DIMENSION(:), INTENT(INOUT) :: FVMD, FVDIFY, VLCFLY +#else REAL, Dimension(-9:NCel) :: FCNt, AFCN, BCNt, UCFL, VCFL, CQ, & CQA, CXTOT, CYTOT REAL, Dimension( NUFc) :: FUMD, FUDIFX, ULCFLX REAL, Dimension( NVFc) :: FVMD, FVDIFY, VLCFLY +#endif !/ !/ ------------------------------------------------------------------- / !/ @@ -394,12 +406,48 @@ SUBROUTINE W3PSMC ( ISP, DTG, VQ ) WRITE (NDST,9010) #endif ! +#ifdef W3_GPU + !$ACC KERNELS + DO ISEA=1,NUFc + ULCFLX(ISEA) = 0.0 + FUMD(ISEA) = 0.0 + FUDIFX(ISEA) = 0.0 + ENDDO + !$ACC END KERNELS + + !$ACC KERNELS + DO ISEA=1,NVFc + VLCFLY(ISEA) = 0.0 + FVMD(ISEA) = 0.0 + FVDIFY(ISEA) = 0.0 + ENDDO + !$ACC END KERNELS + + !$ACC KERNELS + DO ISEA=-9,NCel + CQ(ISEA) = 0.0 + CQA(ISEA) = 0.0 + UCFL(ISEA) = 0.0 + VCFL(ISEA) = 0.0 + FCNt(ISEA) = 0.0 + AFCN(ISEA) = 0.0 + BCNt(ISEA) = 0.0 + CXTOT(ISEA) = 0.0 + CYTOT(ISEA) = 0.0 + AUN(ISEA) = 0.0 + AVN(ISEA) = 0.0 + ENDDO + !$ACC END KERNELS +#else ULCFLX = 0. VLCFLY = 0. 
- +#endif !Li Pass spectral element VQ to CQ and define size-1 cell CFL #ifdef W3_OMPG !$OMP Parallel DO Private(ISEA) +#elif W3_GPU + !$ACC KERNELS + !$ACC LOOP INDEPENDENT #endif DO ISEA=1, NSEA !Li Transported variable is divided by CG as in WW3. @@ -409,12 +457,17 @@ SUBROUTINE W3PSMC ( ISP, DTG, VQ ) END DO #ifdef W3_OMPG !$OMP END Parallel DO +#elif W3_GPU + !$ACC END KERNELS #endif !Li Add current components if any to wave velocity. IF ( FLCUR ) THEN #ifdef W3_OMPG !$OMP Parallel DO Private(ISEA) +#elif W3_GPU + !$ACC KERNELS + !$ACC LOOP INDEPENDENT #endif DO ISEA=1, NSEA CXTOT(ISEA) = (CGCOS * CG(IK,ISEA) + CX(ISEA)) @@ -422,11 +475,16 @@ SUBROUTINE W3PSMC ( ISP, DTG, VQ ) ENDDO #ifdef W3_OMPG !$OMP END Parallel DO +#elif W3_GPU + !$ACC END KERNELS #endif ELSE !Li No current case use group speed only. #ifdef W3_OMPG !$OMP Parallel DO Private(ISEA) +#elif W3_GPU + !$ACC KERNELS + !$ACC LOOP INDEPENDENT #endif DO ISEA=1, NSEA CXTOT(ISEA) = CGCOS * CG(IK,ISEA) @@ -434,6 +492,8 @@ SUBROUTINE W3PSMC ( ISP, DTG, VQ ) END DO #ifdef W3_OMPG !$OMP END Parallel DO +#elif W3_GPU + !$ACC END KERNELS #endif !Li End of IF( FLCUR ) block. ENDIF @@ -441,6 +501,10 @@ SUBROUTINE W3PSMC ( ISP, DTG, VQ ) !Li Arctic cell velocity components need to be rotated !Li back to local east referenence system for propagation. IF( ARCTC ) THEN +#ifdef W3_GPU + !$ACC KERNELS + !$ACC LOOP INDEPENDENT +#endif DO ISEA=NGLO+1, NSEA ARCTH = ANGARC(ISEA-NGLO)*DERA CXC = CXTOT(ISEA)*COS(ARCTH) + CYTOT(ISEA)*SIN(ARCTH) @@ -453,12 +517,18 @@ SUBROUTINE W3PSMC ( ISP, DTG, VQ ) !Li V-component is reset to zero for Polar cell as direction !Li is undefined there. CYTOT(NSEA) = 0.0 +#ifdef W3_GPU + !$ACC END KERNELS +#endif ENDIF !Li Convert velocity components into CFL factors. 
#ifdef W3_OMPG !$OMP Parallel DO Private(ISEA) +#elif W3_GPU + !$ACC KERNELS + !$ACC LOOP INDEPENDENT #endif DO ISEA=1, NSEA UCFL(ISEA) = DTLDX*CXTOT(ISEA)/CLATS(ISEA) @@ -466,12 +536,16 @@ SUBROUTINE W3PSMC ( ISP, DTG, VQ ) ENDDO #ifdef W3_OMPG !$OMP END Parallel DO +#elif W3_GPU + !$ACC END KERNELS #endif !Li Initialise boundary cell CQ and Velocity values. +#ifndef W3_GPU CQ(-9:0)=0.0 UCFL(-9:0)=0.0 VCFL(-9:0)=0.0 +#endif ! ! 3. Loop over frequency-dependent sub-steps -------------------------* ! @@ -684,6 +758,9 @@ SUBROUTINE W3PSMC ( ISP, DTG, VQ ) jvf=NLvVFc(LL) ! ! Use 3rd order UNO3 scheme. JGLi03Sep2015 +#ifdef W3_GPU + !!$ACC KERNELS +#endif IF( FUNO3 ) THEN CALL SMCxUNO3(iuf, juf, CQ, UCFL, ULCFLX, DNND, FUMD, FUDIFX, FMR) ELSE @@ -691,9 +768,15 @@ SUBROUTINE W3PSMC ( ISP, DTG, VQ ) CALL SMCxUNO2(iuf, juf, CQ, UCFL, ULCFLX, DNND, FUMD, FUDIFX, FMR) ENDIF +#ifdef W3_GPU + !!$ACC END KERNELS +#endif ! Store fineset level conservative flux in FCNt advective one in AFCN #ifdef W3_OMPG !$OMP Parallel DO Private(i, L, M, FUTRN) +#elif W3_GPU + !$ACC KERNELS + !$ACC LOOP INDEPENDENT PRIVATE(i, L, M, FUTRN) #endif DO i=iuf, juf L=IJKUFc5(i) @@ -707,21 +790,29 @@ SUBROUTINE W3PSMC ( ISP, DTG, VQ ) IF( (CTRNX(M)+CTRNX(L)) .GE. 1.96 ) THEN #ifdef W3_OMPG !$OMP ATOMIC +#elif W3_GPU + !$ACC ATOMIC #endif FCNt(L) = FCNt(L) - FUTRN ELSE IF( ULCFLX(i) .GE. 0.0 ) THEN #ifdef W3_OMPG !$OMP ATOMIC +#elif W3_GPU + !$ACC ATOMIC #endif FCNt(L) = FCNt(L) - FUTRN*CTRNX(L) ELSE #ifdef W3_OMPG !$OMP ATOMIC +#elif W3_GPU + !$ACC ATOMIC #endif FCNt(L) = FCNt(L) - FUTRN*CTRNX(L)*CTRNX(M) ENDIF #ifdef W3_OMPG !$OMP ATOMIC +#elif W3_GPU + !$ACC ATOMIC #endif ! ChrisB: Re-arranged the RHS term below to make it ! valid for OMP ATMOIC directive. @@ -732,25 +823,32 @@ SUBROUTINE W3PSMC ( ISP, DTG, VQ ) IF( (CTRNX(M)+CTRNX(L)) .GE. 1.96 ) THEN #ifdef W3_OMPG !$OMP ATOMIC +#elif W3_GPU + !$ACC ATOMIC #endif FCNt(M) = FCNt(M) + FUTRN ELSE IF( ULCFLX(i) .GE. 
0.0 ) THEN #ifdef W3_OMPG !$OMP ATOMIC +#elif W3_GPU + !$ACC ATOMIC #endif FCNt(M) = FCNt(M) + FUTRN*CTRNX(M)*CTRNX(L) ELSE #ifdef W3_OMPG !$OMP ATOMIC +#elif W3_GPU + !$ACC ATOMIC #endif FCNt(M) = FCNt(M) + FUTRN*CTRNX(M) ENDIF #ifdef W3_OMPG !$OMP ATOMIC +#elif W3_GPU + !$ACC ATOMIC #endif AFCN(M) = AFCN(M) + (FUMD(i)*UCFL(M)*FMR - FUDIFX(i)) ENDIF - !! !$OMP END CRITICAL ENDDO #ifdef W3_OMPG !$OMP END Parallel DO @@ -761,6 +859,8 @@ SUBROUTINE W3PSMC ( ISP, DTG, VQ ) ! Also divided by another cell x-size as UCFL is in size-1 unit. #ifdef W3_OMPG !$OMP Parallel DO Private(n) +#elif + !$ACC LOOP INDEPENDENT #endif DO n=icl, jcl CQA(n)=CQ(n) + FCNt(n)/FLOAT( IJKCel3(n)*IJKCel4(n) ) @@ -770,8 +870,13 @@ SUBROUTINE W3PSMC ( ISP, DTG, VQ ) ENDDO #ifdef W3_OMPG !$OMP END Parallel DO +#elif + !$ACC END KERNELS #endif ! +#ifdef W3_GPU + !!$ACC KERNELS +#endif ! Use 3rd order UNO3 scheme. JGLi03Sep2015 IF( FUNO3 ) THEN CALL SMCyUNO3(ivf, jvf, CQ, VCFL, VLCFLY, DSSD, FVMD, FVDIFY, FMR) @@ -779,10 +884,16 @@ SUBROUTINE W3PSMC ( ISP, DTG, VQ ) ! Call SMCyUNO2 to calculate MFy value CALL SMCyUNO2(ivf, jvf, CQ, VCFL, VLCFLY, DSSD, FVMD, FVDIFY, FMR) ENDIF +#ifdef W3_GPU + !!$ACC END KERNELS +#endif ! ! Store conservative flux in BCNt #ifdef W3_OMPG !$OMP Parallel DO Private(j, L, M, FVTRN) +#elif W3_GPU + !$ACC KERNELS + !$ACC LOOP INDEPENDENT PRIVATE(j, L, M, FVTRN) #endif DO j=ivf, jvf L=IJKVFc5(j) @@ -796,16 +907,22 @@ SUBROUTINE W3PSMC ( ISP, DTG, VQ ) IF( (CTRNY(M)+CTRNY(L)) .GE. 1.96 ) THEN #ifdef W3_OMPG !$OMP ATOMIC +#elif W3_GPU + !$ACC ATOMIC #endif BCNt(L) = BCNt(L) - FVTRN ELSE IF( VLCFLY(j) .GE. 0.0 ) THEN #ifdef W3_OMPG !$OMP ATOMIC +#elif W3_GPU + !$ACC ATOMIC #endif BCNt(L) = BCNt(L) - FVTRN*CTRNY(L) ELSE #ifdef W3_OMPG !$OMP ATOMIC +#elif W3_GPU + !$ACC ATOMIC #endif BCNt(L) = BCNt(L) - FVTRN*CTRNY(L)*CTRNY(M) ENDIF @@ -815,16 +932,22 @@ SUBROUTINE W3PSMC ( ISP, DTG, VQ ) IF( (CTRNY(M)+CTRNY(L)) .GE. 
1.96 ) THEN #ifdef W3_OMPG !$OMP ATOMIC +#elif W3_GPU + !$ACC ATOMIC #endif BCNt(M) = BCNt(M) + FVTRN ELSE IF( VLCFLY(j) .GE. 0.0 ) THEN #ifdef W3_OMPG !$OMP ATOMIC +#elif W3_GPU + !$ACC ATOMIC #endif BCNt(M) = BCNt(M) + FVTRN*CTRNY(M)*CTRNY(L) ELSE #ifdef W3_OMPG !$OMP ATOMIC +#elif W3_GPU + !$ACC ATOMIC #endif BCNt(M) = BCNt(M) + FVTRN*CTRNY(M) ENDIF @@ -841,6 +964,8 @@ SUBROUTINE W3PSMC ( ISP, DTG, VQ ) !! One cosine factor is also needed to be divided for SMC grid. #ifdef W3_OMPG !$OMP Parallel DO Private(n) +#elif W3_GPU + !$ACC LOOP INDEPENDENT #endif DO n=icl, jcl CQ(n)=CQA(n) + BCNt(n)/( CLATS(n)* & @@ -849,6 +974,8 @@ SUBROUTINE W3PSMC ( ISP, DTG, VQ ) ENDDO #ifdef W3_OMPG !$OMP END Parallel DO +#elif W3_GPU + !$ACC END KERNELS #endif !Li Polar cell needs a special area factor, multi-level case. IF( ARCTC .AND. jcl .EQ. NSEA ) THEN @@ -879,30 +1006,50 @@ SUBROUTINE W3PSMC ( ISP, DTG, VQ ) RD1 = 0. RD2 = 1. END IF +#ifdef W3_GPU + !$ACC KERNELS + !$ACC LOOP INDEPENDENT +#endif DO IBI=1, NBI ISEA = ISBPI(IBI) CQ(ISEA) = (RD1*BBPI0(ISP,IBI) + RD2*BBPIN(ISP,IBI)) & /CG(IK,ISEA) END DO +#ifdef W3_GPU + !$ACC END KERNELS +#endif ENDIF ! !! End of ITLOC DO ENDDO ! Average with 1-2-1 scheme. JGLi20Aug2015 - IF(FVERG) CALL SMCAverg(CQ) + IF ( FVERG ) THEN +#ifdef W3_GPU + !$ACC KERNELS +#endif + CALL SMCAverg(CQ) +#ifdef W3_GPU + !$ACC END KERNELS +#endif + ENDIF ! ! 4. Store results in VQ in proper format --------------------------- * ! #ifdef W3_OMPG !$OMP Parallel DO Private(ISEA) +#elif W3_GPU + !$ACC KERNELS + !$ACC LOOP INDEPENDENT #endif DO ISEA=1, NSEA VQ(ISEA) = MAX ( 0. , CQ(ISEA)*CG(IK,ISEA) ) END DO #ifdef W3_OMPG !$OMP END Parallel DO +#elif W3_GPU + !$ACC END KERNELS #endif ! RETURN @@ -3315,9 +3462,16 @@ SUBROUTINE W3GATHSMC ( ISPEC, FIELD ) ! 1. Shared memory version ------------------------------------------ / ! 
#ifdef W3_SHRD +#ifdef W3_GPU +!$ACC KERNELS +!$ACC LOOP INDEPENDENT +#endif DO ISEA=1, NSEA FIELD(ISEA) = A(ISPEC,ISEA) END DO +#ifdef W3_GPU +!$ACC END KERNELS +#endif ! RETURN #endif @@ -3547,10 +3701,17 @@ SUBROUTINE W3SCATSMC ( ISPEC, MAPSTA, FIELD ) ! 1. Shared memory version ------------------------------------------ * ! #ifdef W3_SHRD +#ifdef W3_GPU +!$ACC KERNELS +!$ACC LOOP INDEPENDENT +#endif DO ISEA=1, NSEA IXY = MAPSF(ISEA,3) IF ( MAPSTA(IXY) .GE. 1 ) A(ISPEC,ISEA) = FIELD(ISEA) END DO +#ifdef W3_GPU +!$ACC END KERNELS +#endif ! RETURN #endif diff --git a/model/src/w3wavemd.F90 b/model/src/w3wavemd.F90 index e31790c1a9..78a65ddc21 100644 --- a/model/src/w3wavemd.F90 +++ b/model/src/w3wavemd.F90 @@ -601,6 +601,9 @@ SUBROUTINE W3WAVE ( IMOD, ODAT, TEND, STAMP, NO_OUT & REAL :: BACANGL #endif integer :: memunit +#ifdef W3_GPU +!$ACC DECLARE CREATE(TAUWX, TAUWY, FIELD) +#endif !/ ------------------------------------------------------------------- / ! 0. Initializations ! @@ -684,6 +687,9 @@ SUBROUTINE W3WAVE ( IMOD, ODAT, TEND, STAMP, NO_OUT & ALLOCATE ( FIELD(1-NY:NY*(NX+2)) ) ENDIF ! +#ifdef W3_GPU +!$ACC ENTER DATA COPYIN(FIELD) +#endif LOCAL = IAPROC .LE. NAPROC UGDTUPDATE = .FALSE. IF (FLAGLL) THEN @@ -1903,7 +1909,13 @@ SUBROUTINE W3WAVE ( IMOD, ODAT, TEND, STAMP, NO_OUT & IX = 1 #ifdef W3_SMC !!Li Propagation on SMC grid uses UNO2 scheme. +#ifdef W3_GPU + CALL W3PSMC (ISPEC,DTG,FIELD,FCNt,AFCN,BCNt,UCFL,VCFL,CQ, & + CQA,ULCFLX,VLCFLY,FUMD,FUDIFX,FVMD,FVDIFY,CXTOT,& + CYTOT, AUN, AVN) +#else CALL W3PSMC ( ISPEC, DTG, FIELD ) +#endif #endif ! ELSE IF (GTYPE .EQ. 
UNGTYPE) THEN From 33f9341f88a0e976200a8cb357a32fab15fb9bc3 Mon Sep 17 00:00:00 2001 From: UKMO-lsampson Date: Fri, 24 Mar 2023 09:03:31 +0000 Subject: [PATCH 02/15] Testing changes to compiler and make --- model/bin/cmplr.env | 8 +++++--- model/bin/comp.tmpl | 5 +++++ model/bin/w3_make | 7 +++++++ 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/model/bin/cmplr.env b/model/bin/cmplr.env index 217e6be3a9..dada56eeaa 100644 --- a/model/bin/cmplr.env +++ b/model/bin/cmplr.env @@ -251,11 +251,10 @@ if [ "$cmplr" == "gnu" ] || [ "$cmplr" == "gnu_debug" ] || [ "$cmplr" == "gnu_pr # Cray compiler if [ ! -z "$(echo $cmplr | grep cray)" ] ; then - comp_seq='ftn' - comp_mpi='ftn' + comp_seq='gfortran' + comp_mpi='mpif90' fi - # OPTIONS - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # common options @@ -267,6 +266,9 @@ if [ "$cmplr" == "gnu" ] || [ "$cmplr" == "gnu_debug" ] || [ "$cmplr" == "gnu_pr # omp options optomp='-fopenmp' + # acc options + optacc='-fopenacc -Minfo=acc' + # optimized options if [ -z "$(echo $cmplr | grep debug)" ] ; then optc="$optc -O3" diff --git a/model/bin/comp.tmpl b/model/bin/comp.tmpl index 3dc544c566..6c7884d3b3 100755 --- a/model/bin/comp.tmpl +++ b/model/bin/comp.tmpl @@ -93,6 +93,11 @@ opt="$opt " fi + # ACC support + if [ "$acc_mod" = 'yes' ] ; then + opt="$opt " + fi + # 2.b Compile - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - $comp $opt $name.$fext 1> $name.out 2> $name.err diff --git a/model/bin/w3_make b/model/bin/w3_make index 8e662bb0e4..8eab8616ad 100755 --- a/model/bin/w3_make +++ b/model/bin/w3_make @@ -490,6 +490,13 @@ EOF fi export omp_mod + acc_mod=no + if [ -n "`grep GPU $switch_file`" ] + then + acc_mod=yes + fi + export acc_mod + oasis_mod=no if [ -n "`grep OASIS $switch_file`" ] then From 4b5d7e0ed80e9c64f7ee27c4e25c51001017d988 Mon Sep 17 00:00:00 2001 From: lsampson Date: Fri, 24 Mar 2023 14:13:25 +0000 Subject: [PATCH 03/15] Fixed #elif condition missing 
W3_GPU --- model/src/w3psmcmd.F90 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/model/src/w3psmcmd.F90 b/model/src/w3psmcmd.F90 index 4e74276413..42379d1a11 100644 --- a/model/src/w3psmcmd.F90 +++ b/model/src/w3psmcmd.F90 @@ -859,7 +859,7 @@ SUBROUTINE W3PSMC ( ISP, DTG, VQ ) ! Also divided by another cell x-size as UCFL is in size-1 unit. #ifdef W3_OMPG !$OMP Parallel DO Private(n) -#elif +#elif W3_GPU !$ACC LOOP INDEPENDENT #endif DO n=icl, jcl @@ -870,7 +870,7 @@ SUBROUTINE W3PSMC ( ISP, DTG, VQ ) ENDDO #ifdef W3_OMPG !$OMP END Parallel DO -#elif +#elif W3_GPU !$ACC END KERNELS #endif ! From db2d4ab2e16e853ede6872e54e436e4cba8cd5fd Mon Sep 17 00:00:00 2001 From: UKMO-lsampson Date: Thu, 30 Mar 2023 13:41:42 +0000 Subject: [PATCH 04/15] Optimised the SMC propagation component. Producing valid model output with the same level of performance as the manual port. --- model/bin/switch_UKMO_GPU | 2 +- model/src/w3gdatmd.F90 | 58 ++++----- model/src/w3initmd.F90 | 16 ++- model/src/w3nmlshelmd.F90 | 19 +++ model/src/w3psmcmd.F90 | 251 +++++++++++++++++++++++++++++++++++--- model/src/w3wavemd.F90 | 22 +++- 6 files changed, 316 insertions(+), 52 deletions(-) diff --git a/model/bin/switch_UKMO_GPU b/model/bin/switch_UKMO_GPU index 505673b159..add5fa6ba4 100644 --- a/model/bin/switch_UKMO_GPU +++ b/model/bin/switch_UKMO_GPU @@ -1 +1 @@ -F90 SHRD PR3 UNO FLX0 LN1 ST4 NL1 BT1 IC0 IS0 REF0 DB1 TR0 BS0 XX0 WNT1 WNX1 CRT1 CRX1 RWND NOGRB NC4 GPU +SHRD SMC UNO PR2 RTD FLX0 LN1 ST4 NL1 BT1 IC0 IS0 REF0 DB1 TR0 BS0 WNT1 WNX1 CRT1 CRX1 RWND NOGRB GPU diff --git a/model/src/w3gdatmd.F90 b/model/src/w3gdatmd.F90 index 9975142d7d..261fd40f0f 100644 --- a/model/src/w3gdatmd.F90 +++ b/model/src/w3gdatmd.F90 @@ -1403,35 +1403,35 @@ MODULE W3GDATMD !/ #ifdef W3_GPU ! For CALC_USTART routine: -!$ACC DECLARE CREATE(ZZWND, AALPHA) - -! 
Make some source large lookup tables resident on GPU: -!$ACC DECLARE COPYIN (SSDSBM, SSDSC, DCKI, & -!$ACC SATWEIGHTS, SATINDICES, CUMULW, & -!$ACC QBI, IKTAB) - -!$ACC DECLARE COPYIN( & -!$ACC FTE, FTF, SSTXFTFTAIL, SSTXFTWN, WWNMEANP, WWNMEANPTAIL) - - -!/LS For ROUTINE SEQ SDB1 -!/CB Added sig, dden, nth, nspec -!$ACC DECLARE COPYIN(KDMN, SDBC1, SDBC2, FDONLY, FSSOURCE, & -!$ACC SIG, DDEN, NTH, NK, NSPEC) - -!/LS For ROUTINE SEQ SBT1 -!$ACC DECLARE COPYIN(MAPWN(:), SBTC1) - -!/LS For ROUTINE SEQ LIU_FORWARD_DISPERSION -!$ACC DECLARE COPYIN(IICEHDISP, IICEDDISP, IICEFDISP, IICEHMIN, IICEDISP) - -!/LS W3PSMC -!$ACC DECLARE COPYIN(IJKCel, IJKUFc, IJKVFc, NCel, NUFc, NVFc )& -!$ACC COPYIN(IJKCel3, IJKCel4, IJKVFc5, IJKVFc6, CLATS )& -!$ACC COPYIN(IJKUFc5,IJKUFc6, NSEA, MRFct, ARCTC, CLATF) - -!/LS POST PORT OPTIMISATION -!$ACC DECLARE COPYIN(TRNY, TRNX) +!!$ACC DECLARE CREATE(ZZWND, AALPHA) +! +!! Make some source large lookup tables resident on GPU: +!!$ACC DECLARE COPYIN (SSDSBM, SSDSC, DCKI, & +!!$ACC SATWEIGHTS, SATINDICES, CUMULW, & +!!$ACC QBI, IKTAB) +! +!!$ACC DECLARE COPYIN( & +!!$ACC FTE, FTF, SSTXFTFTAIL, SSTXFTWN, WWNMEANP, WWNMEANPTAIL) +! +! +!!/LS For ROUTINE SEQ SDB1 +!!/CB Added sig, dden, nth, nspec +!!$ACC DECLARE COPYIN(KDMN, SDBC1, SDBC2, FDONLY, FSSOURCE, & +!!$ACC SIG, DDEN, NTH, NK, NSPEC) +! +!!/LS For ROUTINE SEQ SBT1 +!!$ACC DECLARE COPYIN(MAPWN(:), SBTC1) +! +!!/LS For ROUTINE SEQ LIU_FORWARD_DISPERSION +!!$ACC DECLARE COPYIN(IICEHDISP, IICEDDISP, IICEFDISP, IICEHMIN, IICEDISP) +! +!!/LS W3PSMC +!!$ACC DECLARE COPYIN(IJKCel, IJKUFc, IJKVFc, NCel, NUFc, NVFc )& +!!$ACC COPYIN(IJKCel3, IJKCel4, IJKVFc5, IJKVFc6, CLATS )& +!!$ACC COPYIN(IJKUFc5,IJKUFc6, NSEA, MRFct, ARCTC, CLATF) +! 
+!!/LS POST PORT OPTIMISATION +!!$ACC DECLARE COPYIN(TRNY, TRNX) #endif CONTAINS diff --git a/model/src/w3initmd.F90 b/model/src/w3initmd.F90 index 080745f4cd..a96eeb521a 100644 --- a/model/src/w3initmd.F90 +++ b/model/src/w3initmd.F90 @@ -444,6 +444,14 @@ SUBROUTINE W3INIT ( IMOD, IsMulti, FEXT, MDS, MTRACE, ODAT, FLGRD, FLGR2, FLGD, #endif #ifdef W3_UOST USE W3UOSTMD, ONLY: UOST_SETGRID +#endif +#ifdef W3_GPU + USE W3GDATMD + USE W3WDATMD + USE W3ADATMD + USE W3IDATMD + USE W3ODATMD + USE W3SRC4MD #endif !/ #ifdef W3_MPI @@ -1535,7 +1543,7 @@ SUBROUTINE W3INIT ( IMOD, IsMulti, FEXT, MDS, MTRACE, ODAT, FLGRD, FLGR2, FLGD, !/LS For ROUTINE SEQ SNL1 !$ACC ENTER DATA COPYIN(FACHFE, KDCON, KDMN, SNLC1, SNLS1, SNLS2, SNLS3) -!$ACC UPDATE DEVICE(DELTAUW, DELU, TAUHFT, TAUHFT2, TAUT) +!!$ACC UPDATE DEVICE(DELTAUW, DELU, TAUHFT, TAUHFT2, TAUT) !! For SDS4 !$ACC ENTER DATA COPYIN( & @@ -1569,9 +1577,9 @@ SUBROUTINE W3INIT ( IMOD, IsMulti, FEXT, MDS, MTRACE, ODAT, FLGRD, FLGR2, FLGD, !$ACC COPYIN(DQDY(:,:), NSEA, NBI) !/LS Updates, already present on device. -!$ACC UPDATE DEVICE(DELAB, DIKCUMUL, DELU, DELTAUW, DELUST, DELALP, & -!$ACC DELTAIL, FWTABLE(:), TAUT(:,:), TAUHFT(:,:), & -!$ACC TAUHFT2(:,:,:)) +!!$ACC UPDATE DEVICE(DELAB, DIKCUMUL, DELU, DELTAUW, DELUST, DELALP, & +!!$ACC DELTAIL, FWTABLE(:), TAUT(:,:), TAUHFT(:,:), & +!!$ACC TAUHFT2(:,:,:)) !/LS SMC Grid !$ACC ENTER DATA COPYIN(IJKCel, IJKUFc, IJKVFc, NCel, NUFc, NVFc, FVERG)& diff --git a/model/src/w3nmlshelmd.F90 b/model/src/w3nmlshelmd.F90 index ac53104b26..bd2058ef27 100644 --- a/model/src/w3nmlshelmd.F90 +++ b/model/src/w3nmlshelmd.F90 @@ -984,6 +984,25 @@ SUBROUTINE READ_HOMOGENEOUS_NML (NDSI, NML_HOMOG_COUNT, NML_HOMOG_INPUT) END IF ! save namelist +!LS Hard coded namelist to force homogenous fields + HOMOG_INPUT(1)%NAME = 'WND' + HOMOG_INPUT(1)%DATE = '20100101 120000' + HOMOG_INPUT(1)%VALUE1 = 5. + HOMOG_INPUT(1)%VALUE2 = 90. + HOMOG_INPUT(1)%VALUE3 = 2. 
+ + HOMOG_INPUT(2)%NAME = 'WND' + HOMOG_INPUT(2)%DATE = '20100101 140000' + HOMOG_INPUT(2)%VALUE1 = 25. + HOMOG_INPUT(2)%VALUE2 = 120. + HOMOG_INPUT(2)%VALUE3 = 2. + + HOMOG_INPUT(3)%NAME = 'LEV' + HOMOG_INPUT(3)%DATE = '20100101 120000' + HOMOG_INPUT(3)%VALUE1 = 5. + + NML_HOMOG_COUNT = HOMOG_COUNT + NML_HOMOG_INPUT = HOMOG_INPUT NML_HOMOG_COUNT = HOMOG_COUNT NML_HOMOG_INPUT = HOMOG_INPUT diff --git a/model/src/w3psmcmd.F90 b/model/src/w3psmcmd.F90 index 42379d1a11..cba5b5f3f5 100644 --- a/model/src/w3psmcmd.F90 +++ b/model/src/w3psmcmd.F90 @@ -261,7 +261,7 @@ SUBROUTINE W3PSMC ( ISP, DTG, VQ ) IJKVFc5, IJKVFc6,IJKUFc5,IJKUFc6, & NLvCel, NLvUFc, NLvVFc, NRLv, MRFct, & DTCFL, CLATS, DTMS, CTRNX, CTRNY - USE W3GDATMD, ONLY: NGLO, ANGARC, ARCTC + USE W3GDATMD, ONLY: NGLO, ANGARC, ARCTC, CLATF USE W3WDATMD, ONLY: TIME USE W3ADATMD, ONLY: CG, WN, U10, CX, CY, ATRNX, ATRNY, ITIME ! @@ -301,13 +301,18 @@ SUBROUTINE W3PSMC ( ISP, DTG, VQ ) REAL :: PCArea, ARCTH LOGICAL :: YFIRST !/ - !/ Automatic work arrays #ifdef W3_GPU + !/ Inline SMC functions require additional variables + INTEGER :: ij + REAL :: CNST, CNST0, CNST1, CNST2, CNST3, CNST4, CNST5, CNST6, & + CNST7, CNST8, CNST9 + !/ Automatic work arrays REAL, DIMENSION(:), INTENT(INOUT) :: FCNt, AFCN, BCNt, UCFL, VCFL, CQ, & CQA, CXTOT, CYTOT, AUN, AVN REAL, DIMENSION(:), INTENT(INOUT) :: FUMD, FUDIFX, ULCFLX REAL, DIMENSION(:), INTENT(INOUT) :: FVMD, FVDIFY, VLCFLY #else + !/ Automatic work arrays REAL, Dimension(-9:NCel) :: FCNt, AFCN, BCNt, UCFL, VCFL, CQ, & CQA, CXTOT, CYTOT REAL, Dimension( NUFc) :: FUMD, FUDIFX, ULCFLX @@ -552,9 +557,15 @@ SUBROUTINE W3PSMC ( ISP, DTG, VQ ) DO ITLOC=1, NTLOC ! ! Initialise net flux arrays. +#ifdef W3_GPU + !$ACC KERNELS +#endif FCNt(-9:NCel) = 0.0 AFCN(-9:NCel) = 0.0 BCNt(-9:NCel) = 0.0 +#ifdef W3_GPU + !$ACC END KERNELS +#endif ! ! Single-resolution SMC grid uses regular grid advection with ! 
partial blocking enabled when NRLv = 1 @@ -759,17 +770,96 @@ SUBROUTINE W3PSMC ( ISP, DTG, VQ ) ! ! Use 3rd order UNO3 scheme. JGLi03Sep2015 #ifdef W3_GPU - !!$ACC KERNELS -#endif + !$ACC KERNELS + IF( FUNO3 ) THEN + CNST0=DNND*FMR*FMR*2.0 +!$ACC LOOP INDEPENDENT PRIVATE(i, ij, K, L, M, N, & +!$ACC CNST,CNST1,CNST2,CNST3,CNST4,CNST5,CNST6,CNST7,CNST8,CNST9) + DO i=iuf, juf + K=IJKUFc(4,i) + L=IJKUFc(5,i) + M=IJKUFc(6,i) + N=IJKUFc(7,i) + CNST2=FLOAT( IJKCel3(L) ) + CNST3=FLOAT( IJKCel3(M) ) + CNST5=(CQ(M)-CQ(L))/( CNST2 + CNST3 ) + CNST6=0.5*( UCFL(L)+UCFL(M) )*FMR + ULCFLX(i) = CNST6 + CNST8 = FLOAT( IJKUFc(3,i) ) + ij= MAX(L, M) + IF(CNST6 >= 0.0) THEN + IF( M .LE. 0) ULCFLX(i) = UCFL(L)*FMR + CNST1=FLOAT( IJKCel3(K) ) + CNST4=(CQ(L)-CQ(K))/( CNST2 + CNST1 ) + CNST7 = CNST5 - CNST4 + CNST9 = 2.0/( CNST3+CNST2+CNST2+CNST1 ) + IF( Abs(CNST7) .LT. 0.6*CNST9*Abs(CQ(M)-CQ(K)) ) THEN + CNST= CNST5 - ( CNST3+ULCFLX(i) )*CNST7*CNST9/1.5 + ELSE IF( DBLE(CNST4)*DBLE(CNST5) .GT. 0.d0 ) THEN + CNST=Sign(2.0, CNST5)*min( Abs(CNST4), Abs(CNST5) ) + ELSE + CNST=Sign(1.0, CNST5)*min( Abs(CNST4), Abs(CNST5) ) + ENDIF + FUMD(i)=(CQ(L) + CNST*(CNST2 - ULCFLX(i)))*CNST8 + ELSE + IF( L .LE. 0) ULCFLX(i) = UCFL(M)*FMR + CNST1=FLOAT( IJKCel3(N) ) + CNST4=(CQ(N)-CQ(M))/( CNST1 + CNST3 ) + CNST7 = CNST4 - CNST5 + CNST9 = 2.0/( CNST2+CNST3+CNST3+CNST1 ) + IF( Abs(CNST7) .LT. 0.6*CNST9*Abs(CQ(N)-CQ(L)) ) THEN + CNST= CNST5 + ( CNST2-ULCFLX(i) )*CNST7*CNST9/1.5 + ELSE IF( DBLE(CNST4)*DBLE(CNST5) .GT. 0.d0 ) THEN + CNST=Sign(2.0, CNST5)*min( Abs(CNST4), Abs(CNST5) ) + ELSE + CNST=Sign(1.0, CNST5)*min( Abs(CNST4), Abs(CNST5) ) + ENDIF + FUMD(i)=(CQ(M) - CNST*(CNST3+ULCFLX(i)))*CNST8 + ENDIF + FUDIFX(i)=CNST0*CNST5*CNST8/( CLATS( ij )*CLATS( ij ) ) + END DO + ELSE +!/ LS The SMCxUNO2 routine is inlined to facilitate the OpenACC implicit directives. 
+ CNST0=DNND*FMR*FMR +!$ACC LOOP INDEPENDENT PRIVATE(i, ij,K, L, M, N)& +!$ACC Private(CNST,CNST1,CNST2,CNST3,CNST4,CNST5,CNST6,CNST8,CNST9) + DO i=iuf, juf + K=IJKUFc(4,i) + L=IJKUFc(5,i) + M=IJKUFc(6,i) + N=IJKUFc(7,i) + CNST2=FLOAT( IJKCel3(L) ) + CNST3=FLOAT( IJKCel3(M) ) + CNST5=(CQ(M)-CQ(L))/( CNST2 + CNST3 ) + CNST6=0.5*( UCFL(L)+UCFL(M) )*FMR + ULCFLX(i) = CNST6 + CNST8 = FLOAT( IJKUFc(3,i) ) + ij= MAX(L, M) + CNST9 = 2.0/( CLATS( ij )*CLATS( ij ) ) + IF(CNST6 >= 0.0) THEN + IF( M .LE. 0) ULCFLX(i) = UCFL(L)*FMR + CNST1=FLOAT( IJKCel3(K) ) + CNST4=(CQ(L)-CQ(K))/( CNST2 + CNST1 ) + CNST=Sign(1.0, CNST5)*min( Abs(CNST4), Abs(CNST5) ) + FUMD(i)=(CQ(L) + CNST*(CNST2 - ULCFLX(i)))*CNST8 + ELSE + IF( L .LE. 0) ULCFLX(i) = UCFL(M)*FMR + CNST1=FLOAT( IJKCel3(N) ) + CNST4=(CQ(N)-CQ(M))/( CNST1 + CNST3 ) + CNST=Sign(1.0, CNST5)*min( Abs(CNST4), Abs(CNST5) ) + FUMD(i)=(CQ(M) - CNST*(CNST3+ULCFLX(i)))*CNST8 + ENDIF + FUDIFX(i)=DNND*FMR*FMR*CNST5*CNST8*CNST9 + END DO + ENDIF + !$ACC END KERNELS +#else IF( FUNO3 ) THEN CALL SMCxUNO3(iuf, juf, CQ, UCFL, ULCFLX, DNND, FUMD, FUDIFX, FMR) ELSE ! Call SMCxUNO2 to calculate finest level (size-1) MFx value CALL SMCxUNO2(iuf, juf, CQ, UCFL, ULCFLX, DNND, FUMD, FUDIFX, FMR) ENDIF - -#ifdef W3_GPU - !!$ACC END KERNELS #endif ! Store fineset level conservative flux in FCNt advective one in AFCN #ifdef W3_OMPG @@ -874,19 +964,109 @@ SUBROUTINE W3PSMC ( ISP, DTG, VQ ) !$ACC END KERNELS #endif ! -#ifdef W3_GPU - !!$ACC KERNELS -#endif ! Use 3rd order UNO3 scheme. JGLi03Sep2015 +#ifdef W3_GPU + ! 
Code inline to facilitate GPU port + !$ACC KERNELS + IF( FUNO3 ) THEN + CNST0=DSSD*FMR*FMR*2.0 + +!$ACC LOOP INDEPENDENT PRIVATE(j, k, L, M, N, & +!$ACC CNST,CNST1,CNST2,CNST3,CNST4,CNST5,CNST6,CNST7,CNST8,CNST9) + DO j=ivf, jvf + K=IJKVFc(4,j) + L=IJKVFc(5,j) + M=IJKVFc(6,j) + N=IJKVFc(7,j) + CNST2=FLOAT( IJKCel4(L) ) + CNST3=FLOAT( IJKCel4(M) ) + CNST5=(CQ(M)-CQ(L))/( CNST2 + CNST3 ) + CNST6=0.5*( VCFL(L)+VCFL(M) )*FMR + VLCFLY(j) = CNST6 + CNST8=CLATF(j)*FLOAT( IJKVFc(3,j) ) + IF(CNST6 >= 0.0) THEN + IF( M .LE. 0 ) THEN + VLCFLY(j) = VCFL(L)*FMR + CNST3 = CNST2 + ENDIF + CNST1=FLOAT( IJKCel4(K) ) + CNST4=(CQ(L)-CQ(K))/( CNST2 + CNST1 ) + CNST7 = CNST5 - CNST4 + CNST9 = 2.0/( CNST3+CNST2+CNST2+CNST1 ) + IF( Abs(CNST7) .LT. 0.6*CNST9*Abs(CQ(M)-CQ(K)) ) THEN + CNST= CNST5 - ( CNST3+VLCFLY(j) )*CNST7*CNST9/1.5 + ELSE IF( DBLE(CNST4)*DBLE(CNST5) .GT. 0.d0 ) THEN + CNST=Sign(2.0, CNST5)*min( Abs(CNST4), Abs(CNST5) ) + ELSE + CNST=Sign(1.0, CNST5)*min( Abs(CNST4), Abs(CNST5) ) + ENDIF + FVMD(j)=( CQ(L) + CNST*(CNST2 - VLCFLY(j)) )*CNST8 + ELSE + IF( L .LE. 0 ) THEN + VLCFLY(j) = VCFL(M)*FMR + CNST2 = CNST3 + ENDIF + CNST1=FLOAT( IJKCel4(N) ) + CNST4=(CQ(N)-CQ(M))/( CNST1 + CNST3 ) + CNST7 = CNST4 - CNST5 + CNST9 = 2.0/( CNST2+CNST3+CNST3+CNST1 ) + IF( Abs(CNST7) .LT. 0.6*CNST9*Abs(CQ(N)-CQ(L)) ) THEN + CNST= CNST5 + ( CNST2-VLCFLY(j) )*CNST7*CNST9/1.5 + ELSE IF( DBLE(CNST4)*DBLE(CNST5) .GT. 
0.d0 ) THEN + CNST=Sign(2.0, CNST5)*min( Abs(CNST4), Abs(CNST5) ) + ELSE + CNST=Sign(1.0, CNST5)*min( Abs(CNST4), Abs(CNST5) ) + ENDIF + FVMD(j)=( CQ(M) - CNST*(CNST3 + VLCFLY(j)) )*CNST8 + ENDIF + FVDIFY(j)=CNST0*CNST5*CNST8 + END DO + ELSE + CNST0=DSSD*FMR*FMR*2.0 +!$ACC LOOP INDEPENDENT PRIVATE(j, K, L, M, N )& +!$ACC Private(CNST,CNST1,CNST2,CNST3,CNST4,CNST5,CNST6,CNST8) + DO j=ivf, jvf + K=IJKVFc(4,j) + L=IJKVFc(5,j) + M=IJKVFc(6,j) + N=IJKVFc(7,j) + CNST2=FLOAT( IJKCel4(L) ) + CNST3=FLOAT( IJKCel4(M) ) + CNST5=(CQ(M)-CQ(L))/( CNST2 + CNST3 ) + CNST6=0.5*( VCFL(L)+VCFL(M) )*FMR + VLCFLY(j) = CNST6 + CNST8=CLATF(j)*FLOAT( IJKVFc(3,j) ) + IF(CNST6 >= 0.0) THEN + IF( M .LE. 0 ) THEN + VLCFLY(j) = VCFL(L)*FMR + CNST3 = CNST2 + ENDIF + CNST1=FLOAT( IJKCel4(K) ) + CNST4=(CQ(L)-CQ(K))/( CNST2 + CNST1 ) + CNST=Sign(1.0, CNST5)*min( Abs(CNST4), Abs(CNST5) ) + FVMD(j)=( CQ(L) + CNST*(CNST2 - VLCFLY(j)) )*CNST8 + ELSE + IF( L .LE. 0 ) THEN + VLCFLY(j) = VCFL(M)*FMR + CNST2 = CNST3 + ENDIF + CNST1=FLOAT( IJKCel4(N) ) + CNST4=(CQ(N)-CQ(M))/( CNST1 + CNST3 ) + CNST=Sign(1.0, CNST5)*min( Abs(CNST4), Abs(CNST5) ) + FVMD(j)=( CQ(M) - CNST*(CNST3 + VLCFLY(j)) )*CNST8 + ENDIF + FVDIFY(j)=CNST0*CNST5*CNST8 + END DO + ENDIF + !$ACC END KERNELS +#else IF( FUNO3 ) THEN CALL SMCyUNO3(ivf, jvf, CQ, VCFL, VLCFLY, DSSD, FVMD, FVDIFY, FMR) ELSE ! Call SMCyUNO2 to calculate MFy value CALL SMCyUNO2(ivf, jvf, CQ, VCFL, VLCFLY, DSSD, FVMD, FVDIFY, FMR) ENDIF -#ifdef W3_GPU - !!$ACC END KERNELS -#endif +#endif ! ! Store conservative flux in BCNt #ifdef W3_OMPG @@ -1026,11 +1206,48 @@ SUBROUTINE W3PSMC ( ISP, DTG, VQ ) ! Average with 1-2-1 scheme. JGLi20Aug2015 IF ( FVERG ) THEN #ifdef W3_GPU - !$ACC KERNELS -#endif + !$ACC KERNELS + AUN = 0. + AVN = 0. 
+ CNST0 = CQ(NSEA) + !$ACC LOOP INDEPENDENT PRIVATE(i, L, M, CNST5) + DO i=1, NUFc + L=IJKUFc5(i) + M=IJKUFc6(i) + CNST5=Real( IJKUFc(3,i) )*(CQ(M)+CQ(L)) + IF( L > 0 ) THEN + !$ACC ATOMIC + AUN(L) = AUN(L) + CNST5 + ENDIF + IF( M > 0 ) THEN + !$ACC ATOMIC + AUN(M) = AUN(M) + CNST5 + ENDIF + END DO + !$ACC LOOP INDEPENDENT PRIVATE(j, L, M, CNST6) + DO j=1, NVFc + L=IJKVFc5(j) + M=IJKVFc6(j) + CNST6=Real( IJKVfc(3,j) )*(CQ(M)+CQ(L)) + IF( L > 0 ) THEN + !$ACC ATOMIC + AVN(L) = AVN(L) + CNST6 + ENDIF + IF( M > 0 ) THEN + !$ACC ATOMIC + AVN(M) = AVN(M) + CNST6 + ENDIF + END DO + !$ACC LOOP INDEPENDENT PRIVATE(n, CNST3, CNST4) + DO n=1, NSEA + CNST3=0.125/Real( IJKCel3(n) ) + CNST4=0.125/Real( IJKCel4(n) ) + CQ(n)= AUN(n)*CNST4 + AVN(n)*CNST3 + END DO + IF( ARCTC ) CQ(NSEA) = CNST0 + !$ACC END KERNELS +#else CALL SMCAverg(CQ) -#ifdef W3_GPU - !$ACC END KERNELS #endif ENDIF diff --git a/model/src/w3wavemd.F90 b/model/src/w3wavemd.F90 index fd96254d5d..661b409551 100644 --- a/model/src/w3wavemd.F90 +++ b/model/src/w3wavemd.F90 @@ -197,6 +197,9 @@ MODULE W3WAVEMD ! PUBLIC !/ +#ifdef W3_GPU + REAL :: s1, e1 +#endif CONTAINS !/ ------------------------------------------------------------------- / !> @@ -602,6 +605,10 @@ SUBROUTINE W3WAVE ( IMOD, ODAT, TEND, STAMP, NO_OUT & #endif integer :: memunit #ifdef W3_GPU + REAL, DIMENSION(-9:NCel) :: FCNt, AFCN, BCNt, UCFL, VCFL, & + CQ, CQA, CXTOT, CYTOT, AUN, AVN + REAL, DIMENSION(NUFc) :: ULCFLX, FUMD, FUDIFX + REAL, DIMENSION(NVFc) :: VLCFLY, FVMD, FVDIFY !$ACC DECLARE CREATE(TAUWX, TAUWY, FIELD) #endif !/ ------------------------------------------------------------------- / @@ -1860,6 +1867,9 @@ SUBROUTINE W3WAVE ( IMOD, ODAT, TEND, STAMP, NO_OUT & ! ! Initialize FIELD variable FIELD = 0. +#ifdef W3_GPU + CALL CPU_TIME(s1) +#endif ! DO ISPEC=1, NSPEC IF ( IAPPRO(ISPEC) .EQ. 
IAPROC ) THEN @@ -1941,6 +1951,10 @@ SUBROUTINE W3WAVE ( IMOD, ODAT, TEND, STAMP, NO_OUT & END IF #endif call print_memcheck(memunit, 'memcheck_____:'//' WW3_WAVE TIME LOOP 17') +#ifdef W3_GPU + CALL CPU_TIME(e1) + PRINT*,'SMC Prop time: ', e1-s1 +#endif ! !Li Initialise IK IX IY in case ARC option is not used to avoid warnings. IK=1 @@ -2166,7 +2180,9 @@ SUBROUTINE W3WAVE ( IMOD, ODAT, TEND, STAMP, NO_OUT & !$OMP& REFLEC,REFLED,D50,PSIC,TMP1,TMP2,TMP3,TMP4) !$OMP DO SCHEDULE (DYNAMIC,1) #endif - +#ifdef W3_GPU + CALL CPU_TIME(s1) +#endif ! DO JSEA=1, NSEAL CALL INIT_GET_ISEA(ISEA, JSEA) @@ -2279,6 +2295,10 @@ SUBROUTINE W3WAVE ( IMOD, ODAT, TEND, STAMP, NO_OUT & !$OMP END PARALLEL #endif ! +#ifdef W3_GPU + CALL CPU_TIME(e1) + PRINT*,'SRC time: ', e1-s1 +#endif #ifdef W3_PDLIB #ifdef W3_DEBUGSRC WRITE(740+IAPROC,*) 'ITIME=', ITIME, ' IT=', IT From 6be6b7f70f04571bd8e8c0148a6d0de40d192260 Mon Sep 17 00:00:00 2001 From: lsampson Date: Fri, 31 Mar 2023 14:52:39 +0000 Subject: [PATCH 05/15] Added comments for the addition of GPU switches. --- model/src/w3psmcmd.F90 | 14 ++++++++++---- model/src/w3wavemd.F90 | 18 +----------------- 2 files changed, 11 insertions(+), 21 deletions(-) diff --git a/model/src/w3psmcmd.F90 b/model/src/w3psmcmd.F90 index cba5b5f3f5..8e7e49c5ae 100644 --- a/model/src/w3psmcmd.F90 +++ b/model/src/w3psmcmd.F90 @@ -134,7 +134,8 @@ MODULE W3PSMCMD !> @date 18 Apr 2018 !> #ifdef W3_GPU - SUBROUTINE W3PSMC (ISP,DTG,VQ,FCNt,AFCN,BCNt,UCFL,VCFL,CQ,CQA, & + !/LS GPU Code uses hoisted arrays and functional call. 
+ SUBROUTINE W3PSMC (ISP,DTG,VQ,FCNt,AFCN,BCNt,UCFL,VCFL,CQ,CQA, & ULCFLX,VLCFLY,FUMD,FUDIFX,FVMD,FVDIFY, CXTOT, & CYTOT, AUN, AVN) #else @@ -302,11 +303,12 @@ SUBROUTINE W3PSMC ( ISP, DTG, VQ ) LOGICAL :: YFIRST !/ #ifdef W3_GPU - !/ Inline SMC functions require additional variables + !/LS Inlined SMC functions require additional variables INTEGER :: ij REAL :: CNST, CNST0, CNST1, CNST2, CNST3, CNST4, CNST5, CNST6, & CNST7, CNST8, CNST9 - !/ Automatic work arrays + !/LS GPU ported code using functions via subroutine call to avoid + !/LS automatic work arrays. REAL, DIMENSION(:), INTENT(INOUT) :: FCNt, AFCN, BCNt, UCFL, VCFL, CQ, & CQA, CXTOT, CYTOT, AUN, AVN REAL, DIMENSION(:), INTENT(INOUT) :: FUMD, FUDIFX, ULCFLX @@ -412,6 +414,7 @@ SUBROUTINE W3PSMC ( ISP, DTG, VQ ) #endif ! #ifdef W3_GPU + !/LS Explicit initialisation for hoisted arrays. !$ACC KERNELS DO ISEA=1,NUFc ULCFLX(ISEA) = 0.0 @@ -772,6 +775,7 @@ SUBROUTINE W3PSMC ( ISP, DTG, VQ ) #ifdef W3_GPU !$ACC KERNELS IF( FUNO3 ) THEN +!/LS The SMCxUNO3 routine is inlined to facilitate the OpenACC implicit directives. CNST0=DNND*FMR*FMR*2.0 !$ACC LOOP INDEPENDENT PRIVATE(i, ij, K, L, M, N, & !$ACC CNST,CNST1,CNST2,CNST3,CNST4,CNST5,CNST6,CNST7,CNST8,CNST9) @@ -969,8 +973,8 @@ SUBROUTINE W3PSMC ( ISP, DTG, VQ ) ! Code inline to facilitate GPU port !$ACC KERNELS IF( FUNO3 ) THEN +!/LS The SMCyUNO3 routine is inlined to facilitate the OpenACC implicit directives. CNST0=DSSD*FMR*FMR*2.0 - !$ACC LOOP INDEPENDENT PRIVATE(j, k, L, M, N, & !$ACC CNST,CNST1,CNST2,CNST3,CNST4,CNST5,CNST6,CNST7,CNST8,CNST9) DO j=ivf, jvf @@ -1022,6 +1026,7 @@ SUBROUTINE W3PSMC ( ISP, DTG, VQ ) FVDIFY(j)=CNST0*CNST5*CNST8 END DO ELSE +!/LS The SMCyUNO2 routine is inlined to facilitate the OpenACC implicit directives. CNST0=DSSD*FMR*FMR*2.0 !$ACC LOOP INDEPENDENT PRIVATE(j, K, L, M, N )& !$ACC Private(CNST,CNST1,CNST2,CNST3,CNST4,CNST5,CNST6,CNST8) @@ -1206,6 +1211,7 @@ SUBROUTINE W3PSMC ( ISP, DTG, VQ ) ! Average with 1-2-1 scheme. 
JGLi20Aug2015 IF ( FVERG ) THEN #ifdef W3_GPU +!/ LS The SMCAverg routine is inlined to facilitate the OpenACC implicit directives. !$ACC KERNELS AUN = 0. AVN = 0. diff --git a/model/src/w3wavemd.F90 b/model/src/w3wavemd.F90 index 661b409551..05545413ee 100644 --- a/model/src/w3wavemd.F90 +++ b/model/src/w3wavemd.F90 @@ -197,9 +197,6 @@ MODULE W3WAVEMD ! PUBLIC !/ -#ifdef W3_GPU - REAL :: s1, e1 -#endif CONTAINS !/ ------------------------------------------------------------------- / !> @@ -605,6 +602,7 @@ SUBROUTINE W3WAVE ( IMOD, ODAT, TEND, STAMP, NO_OUT & #endif integer :: memunit #ifdef W3_GPU + !/LS Hoisted automatic arrays from W3PSMC REAL, DIMENSION(-9:NCel) :: FCNt, AFCN, BCNt, UCFL, VCFL, & CQ, CQA, CXTOT, CYTOT, AUN, AVN REAL, DIMENSION(NUFc) :: ULCFLX, FUMD, FUDIFX @@ -1867,9 +1865,6 @@ SUBROUTINE W3WAVE ( IMOD, ODAT, TEND, STAMP, NO_OUT & ! ! Initialize FIELD variable FIELD = 0. -#ifdef W3_GPU - CALL CPU_TIME(s1) -#endif ! DO ISPEC=1, NSPEC IF ( IAPPRO(ISPEC) .EQ. IAPROC ) THEN @@ -1951,10 +1946,6 @@ SUBROUTINE W3WAVE ( IMOD, ODAT, TEND, STAMP, NO_OUT & END IF #endif call print_memcheck(memunit, 'memcheck_____:'//' WW3_WAVE TIME LOOP 17') -#ifdef W3_GPU - CALL CPU_TIME(e1) - PRINT*,'SMC Prop time: ', e1-s1 -#endif ! !Li Initialise IK IX IY in case ARC option is not used to avoid warnings. IK=1 @@ -2179,9 +2170,6 @@ SUBROUTINE W3WAVE ( IMOD, ODAT, TEND, STAMP, NO_OUT & !$OMP PARALLEL PRIVATE (JSEA,ISEA,IX,IY,DELA,DELX,DELY, & !$OMP& REFLEC,REFLED,D50,PSIC,TMP1,TMP2,TMP3,TMP4) !$OMP DO SCHEDULE (DYNAMIC,1) -#endif -#ifdef W3_GPU - CALL CPU_TIME(s1) #endif ! DO JSEA=1, NSEAL @@ -2295,10 +2283,6 @@ SUBROUTINE W3WAVE ( IMOD, ODAT, TEND, STAMP, NO_OUT & !$OMP END PARALLEL #endif ! 
-#ifdef W3_GPU - CALL CPU_TIME(e1) - PRINT*,'SRC time: ', e1-s1 -#endif #ifdef W3_PDLIB #ifdef W3_DEBUGSRC WRITE(740+IAPROC,*) 'ITIME=', ITIME, ' IT=', IT From b8543e21e3440203eeb8b5e3116b0151bade0e3e Mon Sep 17 00:00:00 2001 From: lsampson Date: Thu, 6 Apr 2023 08:31:51 +0000 Subject: [PATCH 06/15] Updating OpenACC directives to be more optimal and avoid some USE statements. --- model/bin/switch_UKMO_GPU | 2 +- model/src/w3initmd.F90 | 69 +++++++++++++++++++-------------------- regtests/bin/matrix.base | 4 +++ 3 files changed, 39 insertions(+), 36 deletions(-) diff --git a/model/bin/switch_UKMO_GPU b/model/bin/switch_UKMO_GPU index add5fa6ba4..dd8670b5d6 100644 --- a/model/bin/switch_UKMO_GPU +++ b/model/bin/switch_UKMO_GPU @@ -1 +1 @@ -SHRD SMC UNO PR2 RTD FLX0 LN1 ST4 NL1 BT1 IC0 IS0 REF0 DB1 TR0 BS0 WNT1 WNX1 CRT1 CRX1 RWND NOGRB GPU +SHRD SMC UNO PR2 RTD FLX0 LN1 ST0 NL1 BT1 IC0 IS0 REF0 DB1 TR0 BS0 WNT1 WNX1 CRT1 CRX1 RWND NOGRB GPU diff --git a/model/src/w3initmd.F90 b/model/src/w3initmd.F90 index a96eeb521a..f3d5a10a94 100644 --- a/model/src/w3initmd.F90 +++ b/model/src/w3initmd.F90 @@ -451,7 +451,6 @@ SUBROUTINE W3INIT ( IMOD, IsMulti, FEXT, MDS, MTRACE, ODAT, FLGRD, FLGR2, FLGD, USE W3ADATMD USE W3IDATMD USE W3ODATMD - USE W3SRC4MD #endif !/ #ifdef W3_MPI @@ -1522,26 +1521,26 @@ SUBROUTINE W3INIT ( IMOD, IsMulti, FEXT, MDS, MTRACE, ODAT, FLGRD, FLGR2, FLGD, ! 8. 
Final MPI set up ----------------------------------------------- / #ifdef W3_GPU !/LS From INSNL1 -!$ACC ENTER DATA COPYIN(DAL1, DAL2, DAL3, NSPECX, NSPECY, NFR )& -!$ACC COPYIN(NFRHGH, NFRCHG, AWG1, AWG2, AWG3, AWG4, AWG5 )& -!$ACC COPYIN(AWG6, AWG7, AWG8, SWG1, SWG2, SWG3, SWG4 )& -!$ACC COPYIN(SWG5, SWG6, SWG7, SWG8 )& -!$ACC COPYIN(ICE(:), RHOAIR(:), ASF(:), BERG(:), FPIS(:) )& -!$ACC COPYIN(ICEH(:), ICEF(:), VA(:,:), ICEDMAX(:), UST(:) )& -!$ACC COPYIN(MAPAXY(:), MAPY2(:), MAPX2(:), INFLAGS2(:) )& -!$ACC COPYIN(ATRNY(:,:), ATRNX(:,:), USTDIR(:), FLOGRD(:,:))& -!$ACC COPYIN(CG(:,:), WN(:,:), ALPHA(:,:), MAPSTA(:,:) )& -!$ACC COPYIN(IP11(:), IP12(:), IP13(:), IP14(:) )& -!$ACC COPYIN(IM11(:), IM12(:), IM13(:), IM14(:) )& -!$ACC COPYIN(IP21(:), IP22(:), IP23(:), IP24(:) )& -!$ACC COPYIN(IM21(:), IM22(:), IM23(:), IM24(:) )& -!$ACC COPYIN(IC11(:), IC12(:), IC21(:), IC22(:) )& -!$ACC COPYIN(IC31(:), IC32(:), IC41(:), IC42(:) )& -!$ACC COPYIN(IC51(:), IC52(:), IC61(:), IC62(:) )& -!$ACC COPYIN(IC71(:), IC72(:), IC81(:), IC82(:), AF11(:) ) - -!/LS For ROUTINE SEQ SNL1 -!$ACC ENTER DATA COPYIN(FACHFE, KDCON, KDMN, SNLC1, SNLS1, SNLS2, SNLS3) +!!$ACC ENTER DATA COPYIN(DAL1, DAL2, DAL3, NSPECX, NSPECY, NFR )& +!!$ACC COPYIN(NFRHGH, NFRCHG, AWG1, AWG2, AWG3, AWG4, AWG5 )& +!!$ACC COPYIN(AWG6, AWG7, AWG8, SWG1, SWG2, SWG3, SWG4 )& +!!$ACC COPYIN(SWG5, SWG6, SWG7, SWG8 )& +!!$ACC COPYIN(ICE(:), RHOAIR(:), ASF(:), BERG(:), FPIS(:) )& +!!$ACC COPYIN(ICEH(:), ICEF(:), VA(:,:), ICEDMAX(:), UST(:) )& +!!$ACC COPYIN(MAPAXY(:), MAPY2(:), MAPX2(:), INFLAGS2(:) )& +!!$ACC COPYIN(ATRNY(:,:), ATRNX(:,:), USTDIR(:), FLOGRD(:,:))& +!!$ACC COPYIN(CG(:,:), WN(:,:), ALPHA(:,:), MAPSTA(:,:) )& +!!$ACC COPYIN(IP11(:), IP12(:), IP13(:), IP14(:) )& +!!$ACC COPYIN(IM11(:), IM12(:), IM13(:), IM14(:) )& +!!$ACC COPYIN(IP21(:), IP22(:), IP23(:), IP24(:) )& +!!$ACC COPYIN(IM21(:), IM22(:), IM23(:), IM24(:) )& +!!$ACC COPYIN(IC11(:), IC12(:), IC21(:), IC22(:) )& +!!$ACC COPYIN(IC31(:), IC32(:), 
IC41(:), IC42(:) )& +!!$ACC COPYIN(IC51(:), IC52(:), IC61(:), IC62(:) )& +!!$ACC COPYIN(IC71(:), IC72(:), IC81(:), IC82(:), AF11(:) ) +! +!!/LS For ROUTINE SEQ SNL1 +!!$ACC ENTER DATA COPYIN(FACHFE, KDCON, KDMN, SNLC1, SNLS1, SNLS2, SNLS3) !!$ACC UPDATE DEVICE(DELTAUW, DELU, TAUHFT, TAUHFT2, TAUT) @@ -1552,29 +1551,29 @@ SUBROUTINE W3INIT ( IMOD, IsMulti, FEXT, MDS, MTRACE, ODAT, FLGRD, FLGR2, FLGD, !$ACC QBI, IKTAB) !! FOR SIN4 -!$ACC ENTER DATA COPYIN( & -!$ACC TTAUWSHELTER, SSINTHP, ZZ0RAT, ZZALP, BBETA, SSWELLF, & -!$ACC FWTABLE, DTH, SIG2, DDEN2) +!!$ACC ENTER DATA COPYIN( & +!!$ACC TTAUWSHELTER, SSINTHP, ZZ0RAT, ZZALP, BBETA, SSWELLF, & +!!$ACC FWTABLE, DTH, SIG2, DDEN2) !! For SPR4 -!$ACC ENTER DATA COPYIN( & -!$ACC ZZWND, AALPHA, & -!$ACC FTE, FTF, SSTXFTFTAIL, SSTXFTWN, WWNMEANP, WWNMEANPTAIL) +!!$ACC ENTER DATA COPYIN( & +!!$ACC ZZWND, AALPHA, & +!!$ACC FTE, FTF, SSTXFTFTAIL, SSTXFTWN, WWNMEANP, WWNMEANPTAIL) !/LS For ROUTINE SEQ SDB1 -!$ACC ENTER DATA COPYIN(SDBC1, SDBC2, FDONLY, FSSOURCE, & -!$ACC SIG, DDEN, NTH, NK, NSPEC) +!!$ACC ENTER DATA COPYIN(SDBC1, SDBC2, FDONLY, FSSOURCE, & +!!$ACC SIG, DDEN, NTH, NK, NSPEC) !/LS For ROUTINE SEQ SBT1 -!$ACC ENTER DATA COPYIN(MAPWN(:), SBTC1) +!!$ACC ENTER DATA COPYIN(MAPWN(:), SBTC1) !/LS Diffuclt variables -!$ACC ENTER DATA COPYIN(MAPSF(:,:), FLAGST(:), CLATS(:) )& -!$ACC COPYIN(MAPFS(:,:), ISBPI(:), BBPIN(:,:), BBPI0(:,:) )& -!$ACC COPYIN(TBPIN(:), TBPI0(:), GSQRT(:,:) )& -!$ACC COPYIN(TIME(:), DPDY(:,:), DPDX(:,:), DQDX(:,:) )& -!$ACC COPYIN(DQDY(:,:), NSEA, NBI) +!!$ACC ENTER DATA COPYIN(MAPSF(:,:), FLAGST(:), CLATS(:) )& +!!$ACC COPYIN(MAPFS(:,:), ISBPI(:), BBPIN(:,:), BBPI0(:,:) )& +!!$ACC COPYIN(TBPIN(:), TBPI0(:), GSQRT(:,:) )& +!!$ACC COPYIN(TIME(:), DPDY(:,:), DPDX(:,:), DQDX(:,:) )& +!!$ACC COPYIN(DQDY(:,:), NSEA, NBI) !/LS Updates, already present on device. 
!!$ACC UPDATE DEVICE(DELAB, DIKCUMUL, DELU, DELTAUW, DELUST, DELALP, & diff --git a/regtests/bin/matrix.base b/regtests/bin/matrix.base index e8d3e69dc7..7dc0bfe586 100755 --- a/regtests/bin/matrix.base +++ b/regtests/bin/matrix.base @@ -1867,11 +1867,15 @@ echo ' ' >> matrix.body echo "$rtst $ww3 -w work_SHRD_SMC ww3_tp2.10" >> matrix.body echo "$rtst -w work_SHRD $ww3 ww3_tp2.16" >> matrix.body + echo "$rtst $ww3 -s GPU -w work_SHRD_SMC_GPU ww3_tp2.10" >> matrix.body + echo "$rtst -s GPU -w work_SHRD_GPU $ww3 ww3_tp2.16" >> matrix.body fi if [ "$smcgr" = 'y' ] && [ "$dist" = 'y' ] then echo ' ' >> matrix.body + echo "$rtst -s MPI_GPU -w work_GPU -f -p $mpi -n $np $ww3 ww3_tp2.10" >> matrix.body + echo "$rtst -s MPI_GPU -w work_GPU -f -p $mpi -n $np $ww3 ww3_tp2.16" >> matrix.body echo "$rtst -s MPI -w work_MPI -f -p $mpi -n $np $ww3 ww3_tp2.10" >> matrix.body echo "$rtst -s MPI -w work_MPI -f -p $mpi -n $np $ww3 ww3_tp2.16" >> matrix.body fi From 3d8043f05713d0158e611e419c9ba255a4ac41e2 Mon Sep 17 00:00:00 2001 From: lsampson Date: Wed, 12 Apr 2023 09:04:55 +0000 Subject: [PATCH 07/15] Added SMC propagation GPU port comments to the manual. --- manual/num/space_SMC.tex | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/manual/num/space_SMC.tex b/manual/num/space_SMC.tex index 5370088a65..75cdc42e9d 100644 --- a/manual/num/space_SMC.tex +++ b/manual/num/space_SMC.tex @@ -196,6 +196,27 @@ \subsubsection{~Spherical Multiple-Cell (SMC) grid} \label{sub:num_space_SMC} combined hybrid and multi-grid parallelization may extend the computer usage to over 100 nodes for the 3 Great Lake sub-grids in \emph{mww3\_test\_09}. +A switch has been created for using the initial OpenACC parallelism of the SMC +grid. This converts the w3psmcmd.F90 module file and function calls to be able +to target a GPU for acceleration. 
Primarily this has been used with the nvfortran +compiler to success and is built on Isambard using the cmake_build_isambard.sh +script with Met Office specifications. + +For optimal performance on GPU there is a range of changes to function calls, +array declarations and nested subroutine calls, which are all managed by the +switch. Since the GPU will deallocate arrays once they leave scope, local arrays +are hoisted to be in the module scope of w3wavemd.F90, and hence resident on +the GPU for longer. For GPU parallelism using OpenACC, data transfers and +parallelism specifications are applied implicitly where possible. For ease of +application the SMC subroutines are inlined so that the implicit optimisations +are correctly defined and maintain valid model output without more intrusive +coding changes. + +The current implementation of the GPU switch has some limitations, this switch +only targets the multi-resolution grids and has not yet been adapted to the case +of \emph{NRLv .EQ. 1}. This expansion would not be difficult but requires further +inlining and similar changes to the code. + It is recommended to read the smc\_docs/SMC\_Grid\_Guide.pdf or the conference paper \citep{tol:LiS17} at conference web page: http://www.waveworkshop.org/15thWaves/ From e3814136f23462d9af808436af3c7baa06a47f45 Mon Sep 17 00:00:00 2001 From: lsampson Date: Wed, 12 Apr 2023 12:04:20 +0000 Subject: [PATCH 08/15] Fixed invalid pointer dereference. 
--- model/src/w3initmd.F90 | 67 ------------------------------------------ model/src/w3psmcmd.F90 | 9 +++--- model/src/w3wavemd.F90 | 20 +++++++++++-- 3 files changed, 22 insertions(+), 74 deletions(-) diff --git a/model/src/w3initmd.F90 b/model/src/w3initmd.F90 index f3d5a10a94..600a9ff63b 100644 --- a/model/src/w3initmd.F90 +++ b/model/src/w3initmd.F90 @@ -444,13 +444,6 @@ SUBROUTINE W3INIT ( IMOD, IsMulti, FEXT, MDS, MTRACE, ODAT, FLGRD, FLGR2, FLGD, #endif #ifdef W3_UOST USE W3UOSTMD, ONLY: UOST_SETGRID -#endif -#ifdef W3_GPU - USE W3GDATMD - USE W3WDATMD - USE W3ADATMD - USE W3IDATMD - USE W3ODATMD #endif !/ #ifdef W3_MPI @@ -1520,66 +1513,6 @@ SUBROUTINE W3INIT ( IMOD, IsMulti, FEXT, MDS, MTRACE, ODAT, FLGRD, FLGR2, FLGD, ! ! 8. Final MPI set up ----------------------------------------------- / #ifdef W3_GPU -!/LS From INSNL1 -!!$ACC ENTER DATA COPYIN(DAL1, DAL2, DAL3, NSPECX, NSPECY, NFR )& -!!$ACC COPYIN(NFRHGH, NFRCHG, AWG1, AWG2, AWG3, AWG4, AWG5 )& -!!$ACC COPYIN(AWG6, AWG7, AWG8, SWG1, SWG2, SWG3, SWG4 )& -!!$ACC COPYIN(SWG5, SWG6, SWG7, SWG8 )& -!!$ACC COPYIN(ICE(:), RHOAIR(:), ASF(:), BERG(:), FPIS(:) )& -!!$ACC COPYIN(ICEH(:), ICEF(:), VA(:,:), ICEDMAX(:), UST(:) )& -!!$ACC COPYIN(MAPAXY(:), MAPY2(:), MAPX2(:), INFLAGS2(:) )& -!!$ACC COPYIN(ATRNY(:,:), ATRNX(:,:), USTDIR(:), FLOGRD(:,:))& -!!$ACC COPYIN(CG(:,:), WN(:,:), ALPHA(:,:), MAPSTA(:,:) )& -!!$ACC COPYIN(IP11(:), IP12(:), IP13(:), IP14(:) )& -!!$ACC COPYIN(IM11(:), IM12(:), IM13(:), IM14(:) )& -!!$ACC COPYIN(IP21(:), IP22(:), IP23(:), IP24(:) )& -!!$ACC COPYIN(IM21(:), IM22(:), IM23(:), IM24(:) )& -!!$ACC COPYIN(IC11(:), IC12(:), IC21(:), IC22(:) )& -!!$ACC COPYIN(IC31(:), IC32(:), IC41(:), IC42(:) )& -!!$ACC COPYIN(IC51(:), IC52(:), IC61(:), IC62(:) )& -!!$ACC COPYIN(IC71(:), IC72(:), IC81(:), IC82(:), AF11(:) ) -! -!!/LS For ROUTINE SEQ SNL1 -!!$ACC ENTER DATA COPYIN(FACHFE, KDCON, KDMN, SNLC1, SNLS1, SNLS2, SNLS3) - -!!$ACC UPDATE DEVICE(DELTAUW, DELU, TAUHFT, TAUHFT2, TAUT) - -!! 
For SDS4 -!$ACC ENTER DATA COPYIN( & -!$ACC SSDSBM, SSDSC, DCKI, & -!$ACC SATWEIGHTS, SATINDICES, CUMULW, & -!$ACC QBI, IKTAB) - -!! FOR SIN4 -!!$ACC ENTER DATA COPYIN( & -!!$ACC TTAUWSHELTER, SSINTHP, ZZ0RAT, ZZALP, BBETA, SSWELLF, & -!!$ACC FWTABLE, DTH, SIG2, DDEN2) - -!! For SPR4 -!!$ACC ENTER DATA COPYIN( & -!!$ACC ZZWND, AALPHA, & -!!$ACC FTE, FTF, SSTXFTFTAIL, SSTXFTWN, WWNMEANP, WWNMEANPTAIL) - - -!/LS For ROUTINE SEQ SDB1 -!!$ACC ENTER DATA COPYIN(SDBC1, SDBC2, FDONLY, FSSOURCE, & -!!$ACC SIG, DDEN, NTH, NK, NSPEC) -!/LS For ROUTINE SEQ SBT1 -!!$ACC ENTER DATA COPYIN(MAPWN(:), SBTC1) - - -!/LS Diffuclt variables -!!$ACC ENTER DATA COPYIN(MAPSF(:,:), FLAGST(:), CLATS(:) )& -!!$ACC COPYIN(MAPFS(:,:), ISBPI(:), BBPIN(:,:), BBPI0(:,:) )& -!!$ACC COPYIN(TBPIN(:), TBPI0(:), GSQRT(:,:) )& -!!$ACC COPYIN(TIME(:), DPDY(:,:), DPDX(:,:), DQDX(:,:) )& -!!$ACC COPYIN(DQDY(:,:), NSEA, NBI) - -!/LS Updates, already present on device. -!!$ACC UPDATE DEVICE(DELAB, DIKCUMUL, DELU, DELTAUW, DELUST, DELALP, & -!!$ACC DELTAIL, FWTABLE(:), TAUT(:,:), TAUHFT(:,:), & -!!$ACC TAUHFT2(:,:,:)) - !/LS SMC Grid !$ACC ENTER DATA COPYIN(IJKCel, IJKUFc, IJKVFc, NCel, NUFc, NVFc, FVERG)& !$ACC COPYIN(IJKCel3, IJKCel4, IJKVFc5, IJKVFc6, CLATS, ANGARC)& diff --git a/model/src/w3psmcmd.F90 b/model/src/w3psmcmd.F90 index 8e7e49c5ae..ce6db990f0 100644 --- a/model/src/w3psmcmd.F90 +++ b/model/src/w3psmcmd.F90 @@ -309,10 +309,11 @@ SUBROUTINE W3PSMC ( ISP, DTG, VQ ) CNST7, CNST8, CNST9 !/LS GPU ported code using functions via subroutine call to avoid !/LS automatic work arrays. 
- REAL, DIMENSION(:), INTENT(INOUT) :: FCNt, AFCN, BCNt, UCFL, VCFL, CQ, & - CQA, CXTOT, CYTOT, AUN, AVN - REAL, DIMENSION(:), INTENT(INOUT) :: FUMD, FUDIFX, ULCFLX - REAL, DIMENSION(:), INTENT(INOUT) :: FVMD, FVDIFY, VLCFLY + REAL, DIMENSION(-9:NCel), INTENT(INOUT) :: FCNt, AFCN, BCNt, UCFL, VCFL, CQ, & + CQA, CXTOT, CYTOT + REAL, DIMENSION(-9:NSEA), INTENT(INOUT) :: AUN, AVN + REAL, DIMENSION(NUFc), INTENT(INOUT) :: FUMD, FUDIFX, ULCFLX + REAL, DIMENSION(NVFc), INTENT(INOUT) :: FVMD, FVDIFY, VLCFLY #else !/ Automatic work arrays REAL, Dimension(-9:NCel) :: FCNt, AFCN, BCNt, UCFL, VCFL, CQ, & diff --git a/model/src/w3wavemd.F90 b/model/src/w3wavemd.F90 index 05545413ee..39ce7a1321 100644 --- a/model/src/w3wavemd.F90 +++ b/model/src/w3wavemd.F90 @@ -603,10 +603,10 @@ SUBROUTINE W3WAVE ( IMOD, ODAT, TEND, STAMP, NO_OUT & integer :: memunit #ifdef W3_GPU !/LS Hoisted automatic arrays from W3PSMC - REAL, DIMENSION(-9:NCel) :: FCNt, AFCN, BCNt, UCFL, VCFL, & + REAL, ALLOCATABLE, DIMENSION(:) :: FCNt, AFCN, BCNt, UCFL, VCFL, & CQ, CQA, CXTOT, CYTOT, AUN, AVN - REAL, DIMENSION(NUFc) :: ULCFLX, FUMD, FUDIFX - REAL, DIMENSION(NVFc) :: VLCFLY, FVMD, FVDIFY + REAL, ALLOCATABLE, DIMENSION(:) :: ULCFLX, FUMD, FUDIFX + REAL, ALLOCATABLE, DIMENSION(:) :: VLCFLY, FVMD, FVDIFY !$ACC DECLARE CREATE(TAUWX, TAUWY, FIELD) #endif !/ ------------------------------------------------------------------- / @@ -635,6 +635,14 @@ SUBROUTINE W3WAVE ( IMOD, ODAT, TEND, STAMP, NO_OUT & ! ALLOCATE(TAUWX(NSEAL), TAUWY(NSEAL)) +#ifdef W3_GPU + IF ( .NOT. ALLOCATED(FCNt) ) THEN + ALLOCATE(FCNt(-9:NCel), AFCN(-9:NCel), BCNt(-9:NCel), UCFL(-9:NCel), VCFL(-9:NCel),& + CQ(-9:NCel), CQA(-9:NCel), CXTOT(-9:NCel), CYTOT(-9:NCel), AUN(-9:NSEA), & + AVN(-9:NSEA), FUMD(NUFc), FUDIFX(NUFc), ULCFLX(NUFc), FVMD(NVFc), & + FVDIFY(NVFc), VLCFLY(NVFc)) + END IF +#endif #ifdef W3_REFRX ALLOCATE(CIK(NSEAL)) #endif @@ -2823,6 +2831,12 @@ SUBROUTINE W3WAVE ( IMOD, ODAT, TEND, STAMP, NO_OUT & ! 
DEALLOCATE(FIELD) DEALLOCATE(TAUWX, TAUWY) +#ifdef W3_GPU + IF ( ALLOCATED(FCNt) ) THEN + DEALLOCATE(FCNt, AFCN, BCNt, UCFL, VCFL, CQ, CQA, CXTOT, CYTOT, AUN, AVN, & + FUMD, FUDIFX, ULCFLX, FVMD, FVDIFY, VLCFLY) + END IF +#endif ! call print_memcheck(memunit, 'memcheck_____:'//' WW3_WAVE END W3WAVE') ! From d6ade1bd9c6c1959d20a55370aa869ea6d0afb5d Mon Sep 17 00:00:00 2001 From: UKMO-lsampson Date: Wed, 12 Apr 2023 13:05:56 +0100 Subject: [PATCH 09/15] Updated manual comments --- manual/num/space_SMC.tex | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/manual/num/space_SMC.tex b/manual/num/space_SMC.tex index 75cdc42e9d..b0ce68ddd9 100644 --- a/manual/num/space_SMC.tex +++ b/manual/num/space_SMC.tex @@ -196,11 +196,14 @@ \subsubsection{~Spherical Multiple-Cell (SMC) grid} \label{sub:num_space_SMC} combined hybrid and multi-grid parallelization may extend the computer usage to over 100 nodes for the 3 Great Lake sub-grids in \emph{mww3\_test\_09}. -A switch has been created for using the initial OpenACC parallelism of the SMC -grid. This converts the w3psmcmd.F90 module file and function calls to be able -to target a GPU for acceleration. Primarily this has been used with the nvfortran -compiler to success and is built on Isambard using the cmake_build_isambard.sh -script with Met Office specifications. +Following the ongoing manual porting efforts at the Met Office, a switch has been +created for using an initial OpenACC parallelism of the SMC grid. This converts the +w3psmcmd.F90 module file and function calls to be able to target a GPU for +acceleration. Primarily this has been used with the nvfortran compiler to success +after being built on Isambard using the \emph{cmake\_build\_isambard.sh} +script with Met Office specifications. Contact Chris Bunney +(\url{christopher.bunney@metoffice.gov.uk}) for details surrounding the Isambard +Implementation. 
For optimal performance on GPU there is a range of changes to function calls, array declarations and nested subroutine calls, which are all managed by the @@ -208,14 +211,19 @@ \subsubsection{~Spherical Multiple-Cell (SMC) grid} \label{sub:num_space_SMC} are hoisted to be in the module scope of w3wavemd.F90, and hence resident on the GPU for longer. For GPU parallelism using OpenACC, data transfers and parallelism specifications are applied implicitly where possible. For ease of -application the SMC subroutines are inlined so that the implicit optimisations -are correctly defined and maintain valid model output without more intrusive -coding changes. +application, the SMC propagation subroutines are inlined so that the implicit +optimisations are correctly defined and maintain valid model output without more +intrusive coding changes. The current implementation of the GPU switch has some limitations, this switch only targets the multi-resolution grids and has not yet been adapted to the case of \emph{NRLv .EQ. 1}. This expansion would not be difficult but requires further -inlining and similar changes to the code. +inlining and similar changes to the code, which have not yet been tested properly. +It is also worth noting that the performance of the current implementation is not +at its full potential. This is due to the majority of the code still being +processed on the CPU, and only a small section actively ported to the GPU. We are +viewing the progress so far as a proof of concept and an initial step towards determining +the best way to integrate GPU acceleration into the current parallelisation options. It is recommended to read the smc\_docs/SMC\_Grid\_Guide.pdf or the conference paper \citep{tol:LiS17} at conference web page: From 45754ab69cc89c40c13439be9cd8dc887fc793be Mon Sep 17 00:00:00 2001 From: lsampson Date: Wed, 12 Apr 2023 13:54:04 +0000 Subject: [PATCH 10/15] Cleaned up miscellaneous code changes before final PR.
--- model/bin/cmplr.env | 7 ++----- model/bin/comp.tmpl | 5 ----- model/src/w3gdatmd.F90 | 32 -------------------------------- model/src/w3idatmd.F90 | 3 --- model/src/w3initmd.F90 | 28 +++++++++++++++------------- model/src/w3nmlshelmd.F90 | 17 ----------------- 6 files changed, 17 insertions(+), 75 deletions(-) diff --git a/model/bin/cmplr.env b/model/bin/cmplr.env index dada56eeaa..dfd74baa65 100644 --- a/model/bin/cmplr.env +++ b/model/bin/cmplr.env @@ -251,8 +251,8 @@ if [ "$cmplr" == "gnu" ] || [ "$cmplr" == "gnu_debug" ] || [ "$cmplr" == "gnu_pr # Cray compiler if [ ! -z "$(echo $cmplr | grep cray)" ] ; then - comp_seq='gfortran' - comp_mpi='mpif90' + comp_seq='ftn' + comp_mpi='ftn' fi # OPTIONS - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -266,9 +266,6 @@ if [ "$cmplr" == "gnu" ] || [ "$cmplr" == "gnu_debug" ] || [ "$cmplr" == "gnu_pr # omp options optomp='-fopenmp' - # acc options - optacc='-fopenacc -Minfo=acc' - # optimized options if [ -z "$(echo $cmplr | grep debug)" ] ; then optc="$optc -O3" diff --git a/model/bin/comp.tmpl b/model/bin/comp.tmpl index 6c7884d3b3..3dc544c566 100755 --- a/model/bin/comp.tmpl +++ b/model/bin/comp.tmpl @@ -93,11 +93,6 @@ opt="$opt " fi - # ACC support - if [ "$acc_mod" = 'yes' ] ; then - opt="$opt " - fi - # 2.b Compile - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - $comp $opt $name.$fext 1> $name.out 2> $name.err diff --git a/model/src/w3gdatmd.F90 b/model/src/w3gdatmd.F90 index 261fd40f0f..6cd6e91d8f 100644 --- a/model/src/w3gdatmd.F90 +++ b/model/src/w3gdatmd.F90 @@ -1401,38 +1401,6 @@ MODULE W3GDATMD REAL, POINTER :: IS1C1, IS1C2 #endif !/ -#ifdef W3_GPU -! For CALC_USTART routine: -!!$ACC DECLARE CREATE(ZZWND, AALPHA) -! -!! Make some source large lookup tables resident on GPU: -!!$ACC DECLARE COPYIN (SSDSBM, SSDSC, DCKI, & -!!$ACC SATWEIGHTS, SATINDICES, CUMULW, & -!!$ACC QBI, IKTAB) -! 
-!!$ACC DECLARE COPYIN( & -!!$ACC FTE, FTF, SSTXFTFTAIL, SSTXFTWN, WWNMEANP, WWNMEANPTAIL) -! -! -!!/LS For ROUTINE SEQ SDB1 -!!/CB Added sig, dden, nth, nspec -!!$ACC DECLARE COPYIN(KDMN, SDBC1, SDBC2, FDONLY, FSSOURCE, & -!!$ACC SIG, DDEN, NTH, NK, NSPEC) -! -!!/LS For ROUTINE SEQ SBT1 -!!$ACC DECLARE COPYIN(MAPWN(:), SBTC1) -! -!!/LS For ROUTINE SEQ LIU_FORWARD_DISPERSION -!!$ACC DECLARE COPYIN(IICEHDISP, IICEDDISP, IICEFDISP, IICEHMIN, IICEDISP) -! -!!/LS W3PSMC -!!$ACC DECLARE COPYIN(IJKCel, IJKUFc, IJKVFc, NCel, NUFc, NVFc )& -!!$ACC COPYIN(IJKCel3, IJKCel4, IJKVFc5, IJKVFc6, CLATS )& -!!$ACC COPYIN(IJKUFc5,IJKUFc6, NSEA, MRFct, ARCTC, CLATF) -! -!!/LS POST PORT OPTIMISATION -!!$ACC DECLARE COPYIN(TRNY, TRNX) -#endif CONTAINS !/ ------------------------------------------------------------------- / diff --git a/model/src/w3idatmd.F90 b/model/src/w3idatmd.F90 index d4a476b2bd..5a63588912 100644 --- a/model/src/w3idatmd.F90 +++ b/model/src/w3idatmd.F90 @@ -265,9 +265,6 @@ MODULE W3IDATMD #ifdef W3_TIDE LOGICAL, POINTER :: FLLEVTIDE, FLCURTIDE, & FLLEVRESI, FLCURRESI -#endif -#ifdef W3_GPU -!$ACC DECLARE COPYIN(INFLAGS2, ICEP2) #endif !/ CONTAINS diff --git a/model/src/w3initmd.F90 b/model/src/w3initmd.F90 index 600a9ff63b..a16a1ac373 100644 --- a/model/src/w3initmd.F90 +++ b/model/src/w3initmd.F90 @@ -1513,19 +1513,21 @@ SUBROUTINE W3INIT ( IMOD, IsMulti, FEXT, MDS, MTRACE, ODAT, FLGRD, FLGR2, FLGD, ! ! 8. 
Final MPI set up ----------------------------------------------- / #ifdef W3_GPU -!/LS SMC Grid -!$ACC ENTER DATA COPYIN(IJKCel, IJKUFc, IJKVFc, NCel, NUFc, NVFc, FVERG)& -!$ACC COPYIN(IJKCel3, IJKCel4, IJKVFc5, IJKVFc6, CLATS, ANGARC)& -!$ACC COPYIN(IJKUFc5,IJKUFc6, MRFct, ARCTC, CLATF, CY, CX)& -!$ACC COPYIN(FUNO3, CTRNX, CTRNY, NLvUFc, NLvVFc, NRLv, FLCUR)& -!$ACC COPYIN(DTCFL, DTMS, SX, SY, NGLO, NLvCel) - -!/LS POST PORT OPTIMISATIONS -!$ACC ENTER DATA COPYIN(WHITECAP, BEDFORMS, TAUBBL, TAUICE)& -!$ACC COPYIN(WNMEAN, TAUOCX, TAUOCY, PHICE, PHIBBL)& -!$ACC COPYIN(PHIOC, TWS, CHARN, PHIAW, TAUWNX, TAUWNY, TAUWIX )& -!$ACC COPYIN(TAUWIY, TAUOX, TAUOY, FCUT, DTDYN, TRNX, TRNY)& -!$ACC COPYIN(AS, DW) +!/LS SMC Grid - GDAT +!$ACC ENTER DATA COPYIN(NK, NTH, DTH, XFR, ESIN, ECOS, SIG, NX, NY) & +!$ACC COPYIN(NSEA, SX, SY, MAPSF, FUNO3, FVERG, IJKCel ) & +!$ACC COPYIN(IJKUFc, IJKVFc, NCel, NUFc, NVFc, IJKCel3 ) & +!$ACC COPYIN(IJKCel4, IJKVFc5, IJKVFc6,IJKUFc5,IJKUFc6 ) & +!$ACC COPYIN(NLvCel, NLvUFc, NLvVFc, NRLv, MRFct, DTCFL) & +!$ACC COPYIN(CLATS, DTMS, CTRNX, CTRNY, NGLO, ANGARC ) & +!$ACC COPYIN(ARCTC, CLATF) +!/LS SMC Grid - ADAT +!$ACC ENTER DATA COPYIN(CG, WN, U10, CX, CY, ATRNX, ATRNY, ITIME) +!/LS SMC Grid - IDAT +!$ACC ENTER DATA COPYIN(FLCUR) +!/LS SMC Grid - ODAT +!$ACC ENTER DATA COPYIN(NDSE, NDST, FLBPI, NBI, TBPI0, TBPIN)& +!$ACC COPYIN(ISBPI, BBPI0, BBPIN) #endif ! #ifdef W3_MPI diff --git a/model/src/w3nmlshelmd.F90 b/model/src/w3nmlshelmd.F90 index bd2058ef27..c72aef49ae 100644 --- a/model/src/w3nmlshelmd.F90 +++ b/model/src/w3nmlshelmd.F90 @@ -984,23 +984,6 @@ SUBROUTINE READ_HOMOGENEOUS_NML (NDSI, NML_HOMOG_COUNT, NML_HOMOG_INPUT) END IF ! save namelist -!LS Hard coded namelist to force homogenous fields - HOMOG_INPUT(1)%NAME = 'WND' - HOMOG_INPUT(1)%DATE = '20100101 120000' - HOMOG_INPUT(1)%VALUE1 = 5. - HOMOG_INPUT(1)%VALUE2 = 90. - HOMOG_INPUT(1)%VALUE3 = 2. 
- - HOMOG_INPUT(2)%NAME = 'WND' - HOMOG_INPUT(2)%DATE = '20100101 140000' - HOMOG_INPUT(2)%VALUE1 = 25. - HOMOG_INPUT(2)%VALUE2 = 120. - HOMOG_INPUT(2)%VALUE3 = 2. - - HOMOG_INPUT(3)%NAME = 'LEV' - HOMOG_INPUT(3)%DATE = '20100101 120000' - HOMOG_INPUT(3)%VALUE1 = 5. - NML_HOMOG_COUNT = HOMOG_COUNT NML_HOMOG_INPUT = HOMOG_INPUT NML_HOMOG_COUNT = HOMOG_COUNT From d4879f213a3ac513c6d7a5c50eb3f9729ec0f30b Mon Sep 17 00:00:00 2001 From: lsampson Date: Wed, 12 Apr 2023 14:14:17 +0000 Subject: [PATCH 11/15] Whitespace --- model/bin/cmplr.env | 1 + 1 file changed, 1 insertion(+) diff --git a/model/bin/cmplr.env b/model/bin/cmplr.env index dfd74baa65..217e6be3a9 100644 --- a/model/bin/cmplr.env +++ b/model/bin/cmplr.env @@ -255,6 +255,7 @@ if [ "$cmplr" == "gnu" ] || [ "$cmplr" == "gnu_debug" ] || [ "$cmplr" == "gnu_pr comp_mpi='ftn' fi + # OPTIONS - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # common options From 9a663eea670dbcea0e704a8a2c9d7bc476a18035 Mon Sep 17 00:00:00 2001 From: lsampson Date: Wed, 12 Apr 2023 14:16:11 +0000 Subject: [PATCH 12/15] w3_make is no longer part of the regular build process. 
--- model/bin/w3_make | 7 ------- 1 file changed, 7 deletions(-) diff --git a/model/bin/w3_make b/model/bin/w3_make index e5e8f38a59..aacc2f82c4 100755 --- a/model/bin/w3_make +++ b/model/bin/w3_make @@ -490,13 +490,6 @@ EOF fi export omp_mod - acc_mod=no - if [ -n "`grep GPU $switch_file`" ] - then - acc_mod=yes - fi - export acc_mod - oasis_mod=no if [ -n "`grep OASIS $switch_file`" ] then From 12b8f9f87da9bd9decb5046b1318e6264199c39e Mon Sep 17 00:00:00 2001 From: lsampson Date: Wed, 12 Apr 2023 14:17:46 +0000 Subject: [PATCH 13/15] Removal of duplicate lines --- model/src/w3nmlshelmd.F90 | 2 -- 1 file changed, 2 deletions(-) diff --git a/model/src/w3nmlshelmd.F90 b/model/src/w3nmlshelmd.F90 index c72aef49ae..ac53104b26 100644 --- a/model/src/w3nmlshelmd.F90 +++ b/model/src/w3nmlshelmd.F90 @@ -986,8 +986,6 @@ SUBROUTINE READ_HOMOGENEOUS_NML (NDSI, NML_HOMOG_COUNT, NML_HOMOG_INPUT) ! save namelist NML_HOMOG_COUNT = HOMOG_COUNT NML_HOMOG_INPUT = HOMOG_INPUT - NML_HOMOG_COUNT = HOMOG_COUNT - NML_HOMOG_INPUT = HOMOG_INPUT END SUBROUTINE READ_HOMOGENEOUS_NML From d7bf35ab542df0c1af613ba71a462f18dff85d2e Mon Sep 17 00:00:00 2001 From: UKMO-lsampson Date: Wed, 12 Apr 2023 14:44:36 +0000 Subject: [PATCH 14/15] Added in the GPU switches for SMC regression tests. 
--- regtests/ww3_tp2.10/input/switch_GPU | 1 + regtests/ww3_tp2.10/input/switch_MPI_GPU | 1 + regtests/ww3_tp2.16/input/switch_GPU | 1 + regtests/ww3_tp2.16/input/switch_MPI_GPU | 1 + 4 files changed, 4 insertions(+) create mode 100644 regtests/ww3_tp2.10/input/switch_GPU create mode 100644 regtests/ww3_tp2.10/input/switch_MPI_GPU create mode 100644 regtests/ww3_tp2.16/input/switch_GPU create mode 100644 regtests/ww3_tp2.16/input/switch_MPI_GPU diff --git a/regtests/ww3_tp2.10/input/switch_GPU b/regtests/ww3_tp2.10/input/switch_GPU new file mode 100644 index 0000000000..8b5fc8007a --- /dev/null +++ b/regtests/ww3_tp2.10/input/switch_GPU @@ -0,0 +1 @@ +NOGRB SHRD GPU PR2 UNO SMC FLX0 LN0 ST0 NL0 BT0 DB0 TR0 BS0 IC0 IS0 REF0 WNT1 WNX1 CRT1 CRX1 O0 O1 O2 O3 O4 O5 O6 O7 O10 O11 diff --git a/regtests/ww3_tp2.10/input/switch_MPI_GPU b/regtests/ww3_tp2.10/input/switch_MPI_GPU new file mode 100644 index 0000000000..5318787156 --- /dev/null +++ b/regtests/ww3_tp2.10/input/switch_MPI_GPU @@ -0,0 +1 @@ +NOGRB DIST MPI GPU PR2 UNO SMC FLX0 LN0 ST4 NL0 BT0 DB0 TR0 BS0 IC0 IS0 REF0 WNT1 WNX1 CRT1 CRX1 O0 O1 O2 O3 O4 O5 O6 O7 O10 O11 diff --git a/regtests/ww3_tp2.16/input/switch_GPU b/regtests/ww3_tp2.16/input/switch_GPU new file mode 100644 index 0000000000..e5a4db23e7 --- /dev/null +++ b/regtests/ww3_tp2.16/input/switch_GPU @@ -0,0 +1 @@ +SHRD GPU NOGRB SMC PR2 UNO ST0 NL0 BT0 DB0 TR0 BS0 WNT0 WNX1 CRT0 CRX1 FLX0 LN0 IC0 REF0 IS0 FLD0 diff --git a/regtests/ww3_tp2.16/input/switch_MPI_GPU b/regtests/ww3_tp2.16/input/switch_MPI_GPU new file mode 100644 index 0000000000..c5065867a0 --- /dev/null +++ b/regtests/ww3_tp2.16/input/switch_MPI_GPU @@ -0,0 +1 @@ +DIST MPI GPU NOGRB SMC PR2 UNO ST0 NL0 BT0 DB0 TR0 BS0 WNT1 WNX1 CRT1 CRX1 FLX0 LN0 IC0 REF0 IS0 FLD0 From ab26d12211dc2cccf6df18de653ceec2facc4916 Mon Sep 17 00:00:00 2001 From: UKMO-lsampson Date: Thu, 13 Apr 2023 07:37:54 +0000 Subject: [PATCH 15/15] Corrected issue with w3initmd USE statements for ACC. 
--- model/src/w3initmd.F90 | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/model/src/w3initmd.F90 b/model/src/w3initmd.F90 index a16a1ac373..2428d22294 100644 --- a/model/src/w3initmd.F90 +++ b/model/src/w3initmd.F90 @@ -444,6 +444,21 @@ SUBROUTINE W3INIT ( IMOD, IsMulti, FEXT, MDS, MTRACE, ODAT, FLGRD, FLGR2, FLGD, #endif #ifdef W3_UOST USE W3UOSTMD, ONLY: UOST_SETGRID +#endif +#ifdef W3_GPU + USE W3GDATMD, ONLY: NK, NTH, DTH, XFR, ESIN, ECOS, SIG, NX, NY, & + NSEA, SX, SY, MAPSF, FUNO3, FVERG, & + IJKCel, IJKUFc, IJKVFc, NCel, NUFc, NVFc, & + IJKCel3, IJKCel4, & + IJKVFc5, IJKVFc6,IJKUFc5,IJKUFc6, & + NLvCel, NLvUFc, NLvVFc, NRLv, MRFct, & + DTCFL, CLATS, DTMS, CTRNX, CTRNY + USE W3GDATMD, ONLY: NGLO, ANGARC, ARCTC, CLATF + USE W3ADATMD, ONLY: CG, WN, U10, CX, CY, ATRNX, ATRNY, ITIME + ! + USE W3IDATMD, ONLY: FLCUR + USE W3ODATMD, ONLY: NDSE, NDST, FLBPI, NBI, TBPI0, TBPIN, & + ISBPI, BBPI0, BBPIN #endif !/ #ifdef W3_MPI