BFS traversal failure in IndexLowering pass

**Issue: The preseg pass is disabled in TransposeTest, so the squeeze op is not peeled off from the fusion, which leads to a BFS traversal failure in the IndexLowering pass.**
To reproduce:
```

// Minimal repro for the BFS traversal failure described in this issue:
// broadcast -> sub -> mul -> squeeze, built as a single fusion and run
// through FusionExecutorCache under the TransposeTest fixture.
TEST_F(TransposeTest, TMP1) {
  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
  Fusion& fusion = *fusion_ptr.get();
  FusionGuard fg(&fusion);

  // Three float inputs with concrete shapes:
  //   tv0: [1, 2048], tv1: [1, 2048, 12288], tv2: [1, 2048, 1]
  auto tv0 = makeContigConcreteTensor({1, 2048}, DataType::Float);
  auto tv1 = makeContigConcreteTensor({1, 2048, 12288}, DataType::Float);
  auto tv2 = makeContigConcreteTensor({1, 2048, 1}, DataType::Float);
  fusion.addInput(tv0);
  fusion.addInput(tv1);
  fusion.addInput(tv2);
  // Append a trailing broadcast dimension to tv0 so it is 3-D like tv1/tv2.
  auto tv3 = broadcast(tv0, {false, false, true});
  auto tv4 = sub(tv3, tv1);
  auto tv5 = mul(tv4, tv2);
  // Squeeze only the leading size-1 dimension; this is the op that the issue
  // reports must be segmented out for lowering to succeed.
  auto tv6 = squeeze(tv5, std::vector<bool>{true, false, false});
  fusion.addOutput(tv6);

  // Random float CUDA tensors on device 0 whose shapes match the fusion inputs.
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  auto t0 = at::randn({1, 2048}, options);
  auto t1 = at::randn({1, 2048, 12288}, options);
  auto t2 = at::randn({1, 2048, 1}, options);

  // Run the fusion through the executor cache and validate the outputs.
  FusionExecutorCache executor_cache(std::move(fusion_ptr));
  auto cg_outputs = executor_cache.runFusionWithInputs({t0, t1, t2});
  testValidate(executor_cache.fusion(), cg_outputs, {t0, t1, t2}, __LINE__, __FILE__);
}
```

**Error when the fusion is not segmented at the squeeze op:**
```

Inputs:
  T0_g_float[bS0{1}, iS1{2048}]
  T1_g_float[bS2{1}, iS3{2048}, iS4{12288}]
  T2_g_float[bS5{1}, iS6{2048}, bS7{1}]
Outputs:
  T6_g_float[iS17{2048}, iS18{12288}]

%kernel_math {
T3_l_float[bS8{1}, iS9{2048}, bS10{1}]
   = broadcast( T0_g_float[bS0{1}, iS1{2048}], flags = {false, false, true} )
T4_l_float[bS11{1}, iS12{2048}, iS13{12288}]
   = T3_l_float[bS8{1}, iS9{2048}, bS10{1}]
   - T1_g_float[bS2{1}, iS3{2048}, iS4{12288}];
T5_l_float[bS14{1}, iS15{2048}, iS16{12288}]
   = T4_l_float[bS11{1}, iS12{2048}, iS13{12288}]
   * T2_g_float[bS5{1}, iS6{2048}, bS7{1}];
T6_g_float[iS17{2048}, iS18{12288}]
   = squeeze( T5_l_float[bS14{1}, iS15{2048}, iS16{12288}], flags = {true, false, false} )
} // %kernel_math 

unknown file: Failure
C++ exception with description " INTERNAL ASSERT FAILED at /opt/pytorch/nvfuser/csrc/bfs.h:261, please report a bug with repro script to NVFuser at https://github.com/NVIDIA/Fuser/issues. 
BFS traversal could not visit some nodes:  idg{111 116} idg{112 117} idg{110 115} (from:  idg{41 42 48 49 55 56 76 77 83 84 90 91 188 190} idg{43 50 57 78 85 92 191} idg{154 159 164 169 174 179 198} idg{155 160 165 170 175 180 199} idg{153 158 163 168 173 178 197 201} idg{0 2 5 8 11 14 27 29 32}), visited: ( idg{193} idg{141 145} idg{140 144} idg{4 13 16 18 31 36} idg{155 160 165 170 175 180 199} idg{40 45 52 59 66 73 80 87 94 101 105} idg{37 46 53 74 81 88 186} idg{153 158 163 168 173 178 197 201} idg{195} idg{194} idg{154 159 164 169 174 179 198} idg{0 2 5 8 11 14 27 29 32} idg{43 50 57 78 85 92 191} idg{60 67 95} idg{41 42 48 49 55 56 76 77 83 84 90 91 188 190} idg{192} idg{39 44 51 58 62 63 65 69 70 72 79 86 93 97 98 100 102 104 106} idg{152 157 162 167 172 177 196 200} idg{64 71 99 103 107} idg{151 156 161 166 171 176} idg{38 47 54 75 82 89 187} idg{138 142} idg{139 143} idg{189} idg{1 3 6 9 12 15 17 28 30 33 35})
```

**No error when the fusion is segmented at the squeeze op (use NVFuserTest so that the preseg pass can segment at the squeeze op):**
```
Inputs:
  T7_g_float[bS19{1}, iS20{2048}, iS21{12288}]
Outputs:
  T6_g_float[iS17{2048}, iS18{12288}]

%kernel_math {
T6_g_float[iS17{2048}, iS18{12288}]
   = squeeze( T7_g_float[bS19{1}, iS20{2048}, iS21{12288}], flags = {true, false, false} )
} // %kernel_math 

Inputs:
  T2_g_float[bS5{1}, iS6{2048}, bS7{1}]
  T0_g_float[bS0{1}, iS1{2048}]
  T1_g_float[bS2{1}, iS3{2048}, iS4{12288}]
Outputs:
  T7_g_float[bS19{1}, iS20{2048}, iS21{12288}]

%kernel_math {
T3_l_float[bS8{1}, iS9{2048}, bS10{1}]
   = broadcast( T0_g_float[bS0{1}, iS1{2048}], flags = {false, false, true} )
T4_g_float[bS11{1}, iS12{2048}, iS13{12288}]
   = T3_l_float[bS8{1}, iS9{2048}, bS10{1}]
   - T1_g_float[bS2{1}, iS3{2048}, iS4{12288}];
T5_g_float[bS14{1}, iS15{2048}, iS16{12288}]
   = T4_g_float[bS11{1}, iS12{2048}, iS13{12288}]
   * T2_g_float[bS5{1}, iS6{2048}, bS7{1}];
T7_g_float[bS19{1}, iS20{2048}, iS21{12288}]
   = SegmenterSet( T5_g_float[bS14{1}, iS15{2048}, iS16{12288}] )
} // %kernel_math 
```

**Context of the original issue:**
While working on heuristic optimization of the inner-outer persistent scheduler, I noticed a failure of the test `tests/python/multidevice/test_transformer.py::test_transformer_backward[TENSOR_PARALLEL]`. It has a segmented fusion that contains a squeeze op:
```
g{(transpose)
group id: 24
inputs:
  T4_g_bool[bS11{1}, iS12{2048}, iS13{12288}] (DeviceMesh{0}) bool
  T5_g___bfloat[bS14{1}, iS15{2048}, iS16{12288}] (DeviceMesh{0}) __bfloat
  T7_g___bfloat[iS19{12288}] (DeviceMesh{0}) __bfloat
  T8_g_float[bS20{1}, iS21{2048}] (DeviceMesh{0}) float
  T9_g___bfloat[bS22{1}, iS23{2048}, iS24{12288}] (DeviceMesh{0}) __bfloat
  T10_g_float[bS25{1}, iS26{2048}, bS27{1}] (DeviceMesh{0}) float
  T22_g___bfloat[iS50{12288}] (DeviceMesh{0}) __bfloat
outputs:
  T64_g___bfloat[bS172{1}, bS173{1 ex 2048}, iS174{12288}] (DeviceMesh{0}) __bfloat
  T74_g_float[bS202{1}, iS203{2048}, iS204{12288}] (DeviceMesh{0}) float
  T80_g_float[bS218{1}, iS219{2048}, bS220{1 ex 12288}] (DeviceMesh{0}) float
  T102_g_float[bS274{1}, iS275{2048}, iS276{12288}] (DeviceMesh{0}) float
  T187_g_float[bS548{1}, iS549{2048}, iS550{12288}] (DeviceMesh{0}) float
  T229_g___bfloat[iS662{2048}, iS663{12288}] (DeviceMesh{0}) __bfloat


T59_g_float[bS157{1}, iS158{2048}, iS159{12288}] (DeviceMesh{0})
   = __bfloat2float(T5_g___bfloat[bS14{1}, iS15{2048}, iS16{12288}] (DeviceMesh{0}));
(36)
T185_l___bfloat[bS542{1}, bS543{1}, iS544{12288}] (DeviceMesh{0})
   = broadcast( T22_g___bfloat[iS50{12288}] (DeviceMesh{0}), flags = {true, true, false} )
(176)
T186_g___bfloat[bS545{1}, bS546{1 ex 2048}, iS547{12288}] (DeviceMesh{0}) = expand( T185_l___bfloat[bS542{1}, bS543{1}, iS544{12288}] (DeviceMesh{0}) )
(270)
T63_l___bfloat[bS169{1}, bS170{1}, iS171{12288}] (DeviceMesh{0})
   = broadcast( T7_g___bfloat[iS19{12288}] (DeviceMesh{0}), flags = {true, true, false} )
(40)
T64_g___bfloat[bS172{1}, bS173{1 ex 2048}, iS174{12288}] (DeviceMesh{0}) = expand( T63_l___bfloat[bS169{1}, bS170{1}, iS171{12288}] (DeviceMesh{0}) )
(269)
T70_l_float[bS190{1}, bS191{1 ex 2048}, iS192{12288}] (DeviceMesh{0})
   = __bfloat2float(T64_g___bfloat[bS172{1}, bS173{1 ex 2048}, iS174{12288}] (DeviceMesh{0}));
(47)
T193_l_float[bS567{1}, bS568{1 ex 2048}, iS569{12288}] (DeviceMesh{0})
   = __bfloat2float(T186_g___bfloat[bS545{1}, bS546{1 ex 2048}, iS547{12288}] (DeviceMesh{0}));
(184)
T93_l_float[bS251{1}, iS252{2048}, bS253{1}] (DeviceMesh{0})
   = broadcast( T8_g_float[bS20{1}, iS21{2048}] (DeviceMesh{0}), flags = {false, false, true} )
(70)
T94_g_float[bS254{1}, iS255{2048}, bS256{1}] (DeviceMesh{0})
   = Set( T93_l_float[bS251{1}, iS252{2048}, bS253{1}] (DeviceMesh{0}), cache_op=Streaming )
(71)
T98_g_float[bS264{1}, iS265{2048}, bS266{1 ex 12288}] (DeviceMesh{0}) = expand( T94_g_float[bS254{1}, iS255{2048}, bS256{1}] (DeviceMesh{0}) )
(273)
T65_l_float[bS175{1}, iS176{2048}, bS177{1}] (DeviceMesh{0})
   = broadcast( T8_g_float[bS20{1}, iS21{2048}] (DeviceMesh{0}), flags = {false, false, true} )
(42)
T66_g_float[bS178{1}, iS179{2048}, bS180{1}] (DeviceMesh{0})
   = Set( T65_l_float[bS175{1}, iS176{2048}, bS177{1}] (DeviceMesh{0}), cache_op=Streaming )
(43)
T71_g_float[bS193{1}, iS194{2048}, bS195{1 ex 12288}] (DeviceMesh{0}) = expand( T66_g_float[bS178{1}, iS179{2048}, bS180{1}] (DeviceMesh{0}) )
(267)
T68_l_float[bS184{1}, iS185{2048}, iS186{12288}] (DeviceMesh{0})
   = __bfloat2float(T9_g___bfloat[bS22{1}, iS23{2048}, iS24{12288}] (DeviceMesh{0}));
(45)
T58_l_float[bS154{1}, iS155{2048}, iS156{12288}] (DeviceMesh{0})
   = __to_float(T4_g_bool[bS11{1}, iS12{2048}, iS13{12288}] (DeviceMesh{0}));
(35)
T61_l_float[bS163{1}, iS164{2048}, iS165{12288}] (DeviceMesh{0})
   = T59_g_float[bS157{1}, iS158{2048}, iS159{12288}] (DeviceMesh{0})
   * T58_l_float[bS154{1}, iS155{2048}, iS156{12288}] (DeviceMesh{0});
(38)
T67_l_float[bS181{1}, iS182{2048}, iS183{12288}] (DeviceMesh{0})
   = T61_l_float[bS163{1}, iS164{2048}, iS165{12288}] (DeviceMesh{0})
   * double(1.11111);
(44)
T72_g_float[bS196{1}, iS197{2048}, iS198{12288}] (DeviceMesh{0})
   = T68_l_float[bS184{1}, iS185{2048}, iS186{12288}] (DeviceMesh{0})
   + T67_l_float[bS181{1}, iS182{2048}, iS183{12288}] (DeviceMesh{0});
(49)
T102_g_float[bS274{1}, iS275{2048}, iS276{12288}] (DeviceMesh{0})
   = T72_g_float[bS196{1}, iS197{2048}, iS198{12288}] (DeviceMesh{0})
   - T98_g_float[bS264{1}, iS265{2048}, bS266{1 ex 12288}] (DeviceMesh{0});
(79)
T80_g_float[bS218{1}, iS219{2048}, bS220{1 ex 12288}] (DeviceMesh{0}) = expand( T10_g_float[bS25{1}, iS26{2048}, bS27{1}] (DeviceMesh{0}) )
(268)
T74_g_float[bS202{1}, iS203{2048}, iS204{12288}] (DeviceMesh{0})
   = T72_g_float[bS196{1}, iS197{2048}, iS198{12288}] (DeviceMesh{0})
   - T71_g_float[bS193{1}, iS194{2048}, bS195{1 ex 12288}] (DeviceMesh{0});
(51)
T187_g_float[bS548{1}, iS549{2048}, iS550{12288}] (DeviceMesh{0})
   = T74_g_float[bS202{1}, iS203{2048}, iS204{12288}] (DeviceMesh{0})
   * T80_g_float[bS218{1}, iS219{2048}, bS220{1 ex 12288}] (DeviceMesh{0});
(178)
T194_l_float[bS570{1}, iS571{2048}, iS572{12288}] (DeviceMesh{0})
   = T187_g_float[bS548{1}, iS549{2048}, iS550{12288}] (DeviceMesh{0})
   * T70_l_float[bS190{1}, bS191{1 ex 2048}, iS192{12288}] (DeviceMesh{0});
(185)
T199_l_float[bS586{1}, iS587{2048}, iS588{12288}] (DeviceMesh{0})
   = T194_l_float[bS570{1}, iS571{2048}, iS572{12288}] (DeviceMesh{0})
   + T193_l_float[bS567{1}, bS568{1 ex 2048}, iS569{12288}] (DeviceMesh{0});
(191)
T208_g___bfloat[bS617{1}, iS618{2048}, iS619{12288}] (DeviceMesh{0})
   = __float2bfloat(T199_l_float[bS586{1}, iS587{2048}, iS588{12288}] (DeviceMesh{0}));
(201)
T229_g___bfloat[iS662{2048}, iS663{12288}] (DeviceMesh{0})
   = squeeze( T208_g___bfloat[bS617{1}, iS618{2048}, iS619{12288}] (DeviceMesh{0}), flags = {true, false, false} )
(222)
}

```


Provide feedback

Saved searches

Use saved searches to filter your results more quickly

BFS traversal failure in IndexLowering pass #5779

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

BFS traversal failure in IndexLowering pass #5779

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions