// Repro: a pointwise chain (broadcast -> sub -> mul) followed by a squeeze of
// the leading size-1 dimension, executed through FusionExecutorCache and
// validated with testValidate.
TEST_F(TransposeTest, TMP1) {
  auto fusion_holder = std::make_unique<Fusion>();
  Fusion& fusion = *fusion_holder;
  FusionGuard guard(&fusion);

  // Fusion inputs; registration order must match the runtime argument order.
  auto row_in = makeContigConcreteTensor({1, 2048}, DataType::Float);
  auto full_in = makeContigConcreteTensor({1, 2048, 12288}, DataType::Float);
  auto col_in = makeContigConcreteTensor({1, 2048, 1}, DataType::Float);
  fusion.addInput(row_in);
  fusion.addInput(full_in);
  fusion.addInput(col_in);

  // (broadcast(row_in) - full_in) * col_in, then drop the leading dimension.
  auto row_bcast = broadcast(row_in, {false, false, true});
  auto diff = sub(row_bcast, full_in);
  auto scaled = mul(diff, col_in);
  auto squeezed = squeeze(scaled, std::vector<bool>{true, false, false});
  fusion.addOutput(squeezed);

  // Random float CUDA tensors with the same shapes as the fusion inputs.
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  auto t0 = at::randn({1, 2048}, options);
  auto t1 = at::randn({1, 2048, 12288}, options);
  auto t2 = at::randn({1, 2048, 1}, options);

  // The cache takes ownership of the fusion; query it via fusion() afterwards.
  FusionExecutorCache executor_cache(std::move(fusion_holder));
  auto cg_outputs = executor_cache.runFusionWithInputs({t0, t1, t2});
  testValidate(
      executor_cache.fusion(),
      cg_outputs,
      {t0, t1, t2},
      __LINE__,
      __FILE__);
}
Inputs:
T0_g_float[bS0{1}, iS1{2048}]
T1_g_float[bS2{1}, iS3{2048}, iS4{12288}]
T2_g_float[bS5{1}, iS6{2048}, bS7{1}]
Outputs:
T6_g_float[iS17{2048}, iS18{12288}]
%kernel_math {
T3_l_float[bS8{1}, iS9{2048}, bS10{1}]
= broadcast( T0_g_float[bS0{1}, iS1{2048}], flags = {false, false, true} )
T4_l_float[bS11{1}, iS12{2048}, iS13{12288}]
= T3_l_float[bS8{1}, iS9{2048}, bS10{1}]
- T1_g_float[bS2{1}, iS3{2048}, iS4{12288}];
T5_l_float[bS14{1}, iS15{2048}, iS16{12288}]
= T4_l_float[bS11{1}, iS12{2048}, iS13{12288}]
* T2_g_float[bS5{1}, iS6{2048}, bS7{1}];
T6_g_float[iS17{2048}, iS18{12288}]
= squeeze( T5_l_float[bS14{1}, iS15{2048}, iS16{12288}], flags = {true, false, false} )
} // %kernel_math
unknown file: Failure
C++ exception with description " INTERNAL ASSERT FAILED at /opt/pytorch/nvfuser/csrc/bfs.h:261, please report a bug with repro script to NVFuser at https://github.com/NVIDIA/Fuser/issues.
BFS traversal could not visit some nodes: idg{111 116} idg{112 117} idg{110 115} (from: idg{41 42 48 49 55 56 76 77 83 84 90 91 188 190} idg{43 50 57 78 85 92 191} idg{154 159 164 169 174 179 198} idg{155 160 165 170 175 180 199} idg{153 158 163 168 173 178 197 201} idg{0 2 5 8 11 14 27 29 32}), visited: ( idg{193} idg{141 145} idg{140 144} idg{4 13 16 18 31 36} idg{155 160 165 170 175 180 199} idg{40 45 52 59 66 73 80 87 94 101 105} idg{37 46 53 74 81 88 186} idg{153 158 163 168 173 178 197 201} idg{195} idg{194} idg{154 159 164 169 174 179 198} idg{0 2 5 8 11 14 27 29 32} idg{43 50 57 78 85 92 191} idg{60 67 95} idg{41 42 48 49 55 56 76 77 83 84 90 91 188 190} idg{192} idg{39 44 51 58 62 63 65 69 70 72 79 86 93 97 98 100 102 104 106} idg{152 157 162 167 172 177 196 200} idg{64 71 99 103 107} idg{151 156 161 166 171 176} idg{38 47 54 75 82 89 187} idg{138 142} idg{139 143} idg{189} idg{1 3 6 9 12 15 17 28 30 33 35})
Inputs:
T7_g_float[bS19{1}, iS20{2048}, iS21{12288}]
Outputs:
T6_g_float[iS17{2048}, iS18{12288}]
%kernel_math {
T6_g_float[iS17{2048}, iS18{12288}]
= squeeze( T7_g_float[bS19{1}, iS20{2048}, iS21{12288}], flags = {true, false, false} )
} // %kernel_math
Inputs:
T2_g_float[bS5{1}, iS6{2048}, bS7{1}]
T0_g_float[bS0{1}, iS1{2048}]
T1_g_float[bS2{1}, iS3{2048}, iS4{12288}]
Outputs:
T7_g_float[bS19{1}, iS20{2048}, iS21{12288}]
%kernel_math {
T3_l_float[bS8{1}, iS9{2048}, bS10{1}]
= broadcast( T0_g_float[bS0{1}, iS1{2048}], flags = {false, false, true} )
T4_g_float[bS11{1}, iS12{2048}, iS13{12288}]
= T3_l_float[bS8{1}, iS9{2048}, bS10{1}]
- T1_g_float[bS2{1}, iS3{2048}, iS4{12288}];
T5_g_float[bS14{1}, iS15{2048}, iS16{12288}]
= T4_g_float[bS11{1}, iS12{2048}, iS13{12288}]
* T2_g_float[bS5{1}, iS6{2048}, bS7{1}];
T7_g_float[bS19{1}, iS20{2048}, iS21{12288}]
= SegmenterSet( T5_g_float[bS14{1}, iS15{2048}, iS16{12288}] )
} // %kernel_math
g{(transpose)
group id: 24
inputs:
T4_g_bool[bS11{1}, iS12{2048}, iS13{12288}] (DeviceMesh{0}) bool
T5_g___bfloat[bS14{1}, iS15{2048}, iS16{12288}] (DeviceMesh{0}) __bfloat
T7_g___bfloat[iS19{12288}] (DeviceMesh{0}) __bfloat
T8_g_float[bS20{1}, iS21{2048}] (DeviceMesh{0}) float
T9_g___bfloat[bS22{1}, iS23{2048}, iS24{12288}] (DeviceMesh{0}) __bfloat
T10_g_float[bS25{1}, iS26{2048}, bS27{1}] (DeviceMesh{0}) float
T22_g___bfloat[iS50{12288}] (DeviceMesh{0}) __bfloat
outputs:
T64_g___bfloat[bS172{1}, bS173{1 ex 2048}, iS174{12288}] (DeviceMesh{0}) __bfloat
T74_g_float[bS202{1}, iS203{2048}, iS204{12288}] (DeviceMesh{0}) float
T80_g_float[bS218{1}, iS219{2048}, bS220{1 ex 12288}] (DeviceMesh{0}) float
T102_g_float[bS274{1}, iS275{2048}, iS276{12288}] (DeviceMesh{0}) float
T187_g_float[bS548{1}, iS549{2048}, iS550{12288}] (DeviceMesh{0}) float
T229_g___bfloat[iS662{2048}, iS663{12288}] (DeviceMesh{0}) __bfloat
T59_g_float[bS157{1}, iS158{2048}, iS159{12288}] (DeviceMesh{0})
= __bfloat2float(T5_g___bfloat[bS14{1}, iS15{2048}, iS16{12288}] (DeviceMesh{0}));
(36)
T185_l___bfloat[bS542{1}, bS543{1}, iS544{12288}] (DeviceMesh{0})
= broadcast( T22_g___bfloat[iS50{12288}] (DeviceMesh{0}), flags = {true, true, false} )
(176)
T186_g___bfloat[bS545{1}, bS546{1 ex 2048}, iS547{12288}] (DeviceMesh{0}) = expand( T185_l___bfloat[bS542{1}, bS543{1}, iS544{12288}] (DeviceMesh{0}) )
(270)
T63_l___bfloat[bS169{1}, bS170{1}, iS171{12288}] (DeviceMesh{0})
= broadcast( T7_g___bfloat[iS19{12288}] (DeviceMesh{0}), flags = {true, true, false} )
(40)
T64_g___bfloat[bS172{1}, bS173{1 ex 2048}, iS174{12288}] (DeviceMesh{0}) = expand( T63_l___bfloat[bS169{1}, bS170{1}, iS171{12288}] (DeviceMesh{0}) )
(269)
T70_l_float[bS190{1}, bS191{1 ex 2048}, iS192{12288}] (DeviceMesh{0})
= __bfloat2float(T64_g___bfloat[bS172{1}, bS173{1 ex 2048}, iS174{12288}] (DeviceMesh{0}));
(47)
T193_l_float[bS567{1}, bS568{1 ex 2048}, iS569{12288}] (DeviceMesh{0})
= __bfloat2float(T186_g___bfloat[bS545{1}, bS546{1 ex 2048}, iS547{12288}] (DeviceMesh{0}));
(184)
T93_l_float[bS251{1}, iS252{2048}, bS253{1}] (DeviceMesh{0})
= broadcast( T8_g_float[bS20{1}, iS21{2048}] (DeviceMesh{0}), flags = {false, false, true} )
(70)
T94_g_float[bS254{1}, iS255{2048}, bS256{1}] (DeviceMesh{0})
= Set( T93_l_float[bS251{1}, iS252{2048}, bS253{1}] (DeviceMesh{0}), cache_op=Streaming )
(71)
T98_g_float[bS264{1}, iS265{2048}, bS266{1 ex 12288}] (DeviceMesh{0}) = expand( T94_g_float[bS254{1}, iS255{2048}, bS256{1}] (DeviceMesh{0}) )
(273)
T65_l_float[bS175{1}, iS176{2048}, bS177{1}] (DeviceMesh{0})
= broadcast( T8_g_float[bS20{1}, iS21{2048}] (DeviceMesh{0}), flags = {false, false, true} )
(42)
T66_g_float[bS178{1}, iS179{2048}, bS180{1}] (DeviceMesh{0})
= Set( T65_l_float[bS175{1}, iS176{2048}, bS177{1}] (DeviceMesh{0}), cache_op=Streaming )
(43)
T71_g_float[bS193{1}, iS194{2048}, bS195{1 ex 12288}] (DeviceMesh{0}) = expand( T66_g_float[bS178{1}, iS179{2048}, bS180{1}] (DeviceMesh{0}) )
(267)
T68_l_float[bS184{1}, iS185{2048}, iS186{12288}] (DeviceMesh{0})
= __bfloat2float(T9_g___bfloat[bS22{1}, iS23{2048}, iS24{12288}] (DeviceMesh{0}));
(45)
T58_l_float[bS154{1}, iS155{2048}, iS156{12288}] (DeviceMesh{0})
= __to_float(T4_g_bool[bS11{1}, iS12{2048}, iS13{12288}] (DeviceMesh{0}));
(35)
T61_l_float[bS163{1}, iS164{2048}, iS165{12288}] (DeviceMesh{0})
= T59_g_float[bS157{1}, iS158{2048}, iS159{12288}] (DeviceMesh{0})
* T58_l_float[bS154{1}, iS155{2048}, iS156{12288}] (DeviceMesh{0});
(38)
T67_l_float[bS181{1}, iS182{2048}, iS183{12288}] (DeviceMesh{0})
= T61_l_float[bS163{1}, iS164{2048}, iS165{12288}] (DeviceMesh{0})
* double(1.11111);
(44)
T72_g_float[bS196{1}, iS197{2048}, iS198{12288}] (DeviceMesh{0})
= T68_l_float[bS184{1}, iS185{2048}, iS186{12288}] (DeviceMesh{0})
+ T67_l_float[bS181{1}, iS182{2048}, iS183{12288}] (DeviceMesh{0});
(49)
T102_g_float[bS274{1}, iS275{2048}, iS276{12288}] (DeviceMesh{0})
= T72_g_float[bS196{1}, iS197{2048}, iS198{12288}] (DeviceMesh{0})
- T98_g_float[bS264{1}, iS265{2048}, bS266{1 ex 12288}] (DeviceMesh{0});
(79)
T80_g_float[bS218{1}, iS219{2048}, bS220{1 ex 12288}] (DeviceMesh{0}) = expand( T10_g_float[bS25{1}, iS26{2048}, bS27{1}] (DeviceMesh{0}) )
(268)
T74_g_float[bS202{1}, iS203{2048}, iS204{12288}] (DeviceMesh{0})
= T72_g_float[bS196{1}, iS197{2048}, iS198{12288}] (DeviceMesh{0})
- T71_g_float[bS193{1}, iS194{2048}, bS195{1 ex 12288}] (DeviceMesh{0});
(51)
T187_g_float[bS548{1}, iS549{2048}, iS550{12288}] (DeviceMesh{0})
= T74_g_float[bS202{1}, iS203{2048}, iS204{12288}] (DeviceMesh{0})
* T80_g_float[bS218{1}, iS219{2048}, bS220{1 ex 12288}] (DeviceMesh{0});
(178)
T194_l_float[bS570{1}, iS571{2048}, iS572{12288}] (DeviceMesh{0})
= T187_g_float[bS548{1}, iS549{2048}, iS550{12288}] (DeviceMesh{0})
* T70_l_float[bS190{1}, bS191{1 ex 2048}, iS192{12288}] (DeviceMesh{0});
(185)
T199_l_float[bS586{1}, iS587{2048}, iS588{12288}] (DeviceMesh{0})
= T194_l_float[bS570{1}, iS571{2048}, iS572{12288}] (DeviceMesh{0})
+ T193_l_float[bS567{1}, bS568{1 ex 2048}, iS569{12288}] (DeviceMesh{0});
(191)
T208_g___bfloat[bS617{1}, iS618{2048}, iS619{12288}] (DeviceMesh{0})
= __float2bfloat(T199_l_float[bS586{1}, iS587{2048}, iS588{12288}] (DeviceMesh{0}));
(201)
T229_g___bfloat[iS662{2048}, iS663{12288}] (DeviceMesh{0})
= squeeze( T208_g___bfloat[bS617{1}, iS618{2048}, iS619{12288}] (DeviceMesh{0}), flags = {true, false, false} )
(222)
}
Issue: The preseg pass is disabled in TransposeTest, so the squeeze op is not peeled off from the fusion, which leads to a BFS traversal failure in the IndexLowering pass.
To reproduce:
The error occurs if the fusion is not segmented at the squeeze op.
There is no error if the fusion is segmented at the squeeze op (use NVFuserTest instead of TransposeTest so that the preseg pass can segment at the squeeze op).
Context of the original issue:
While working on heuristic optimization of the inner-outer persistent scheduler, I noticed a failure in the test
tests/python/multidevice/test_transformer.py::test_transformer_backward[TENSOR_PARALLEL], which has a segmented fusion that contains a squeeze op.