Oysiyl committed on
Commit 0a9ecd7 · 1 Parent(s): ffdb0ac

add archive unpacking plus enable torch.compile skip if mps

Files changed (1):
  1. app.py +25 -13
app.py CHANGED
@@ -1,5 +1,16 @@
-import json
 import os
+import tarfile
+
+# Extract pre-compiled Triton kernels if they exist
+if os.path.exists("triton_cache.tar.gz") and not os.path.exists(
+    os.path.expanduser("~/.triton/cache")
+):
+    print("📦 Extracting pre-compiled Triton kernels...")
+    with tarfile.open("triton_cache.tar.gz", "r:gz") as tar:
+        tar.extractall(path=os.path.expanduser("~"))
+    print("✅ Triton kernels ready!")
+
+import json
 import random
 import sys
 import warnings
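Note: the hunk above unpacks a pre-built Triton kernel cache at startup, so the first compiled run can reuse kernels instead of JIT-compiling from scratch. For context, a minimal sketch of how such an archive could be produced; this companion script is an assumption (it is not part of this commit), but the archive name and cache location match what app.py expects:

```python
# Hypothetical packing script (not part of this commit): bundle the local
# Triton kernel cache so app.py can unpack it at startup.
import os
import tarfile

cache_dir = os.path.expanduser("~/.triton/cache")  # Triton's default kernel cache

if os.path.isdir(cache_dir):
    with tarfile.open("triton_cache.tar.gz", "w:gz") as tar:
        # Store paths relative to $HOME so extractall(path="~") recreates
        # ~/.triton/cache exactly where app.py looks for it.
        tar.add(cache_dir, arcname=".triton/cache")
    print(f"Packed {cache_dir} into triton_cache.tar.gz")
else:
    print("No Triton cache found; run the app once on a GPU machine first.")
```

One caveat on the extraction side: tarfile.extractall can write outside the target directory for a malicious archive, which is acceptable here only because the archive ships with the repository itself.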
@@ -355,8 +366,8 @@ def _apply_torch_compile_optimizations():
         model=standard_model,
         backend="inductor",
         mode="reduce-overhead",  # Best for iterative sampling
-        fullgraph=True,  # Now possible: timestep_embedding fixed + no progress hooks
-        dynamic=True,  # Support all sizes (512-1024, step 64) with one kernel
+        fullgraph=False,  # Allow SAG to capture attention maps
+        dynamic=False,  # Static shapes: one specialized kernel per size (512-1024, step 64)
         keys=["diffusion_model"],  # Compile UNet only
     )
     print(" ✓ Compiled standard pipeline diffusion model")
@@ -367,8 +378,8 @@ def _apply_torch_compile_optimizations():
         model=artistic_model,
         backend="inductor",
         mode="reduce-overhead",
-        fullgraph=True,  # Now possible: timestep_embedding fixed + no progress hooks
-        dynamic=True,  # Support all sizes (512-1024, step 64) with one kernel
+        fullgraph=False,  # Allow SAG to capture attention maps
+        dynamic=False,  # Static shapes: one specialized kernel per size (512-1024, step 64)
         keys=["diffusion_model"],
     )
     print(" ✓ Compiled artistic pipeline diffusion model")
@@ -379,14 +390,15 @@ def _apply_torch_compile_optimizations():
     print(" Continuing without compilation (slower but functional)\n")
 
 
-# torch.compile DISABLED: Multiple device access issues in ComfyUI codebase
-# Issues found:
-# 1. comfy/ldm/modules/diffusionmodules/util.py - timestep_embedding (FIXED with .to())
-# 2. comfy_extras/nodes_freelunch.py:94 - hsp.device check in output_block_patch
-# With fullgraph=True, compilation traces too deep and hits these ConstantVariable errors
-# App still uses bfloat16 optimization for 1.3-1.5× speedup
-print("ℹ️ torch.compile disabled (ComfyUI device access incompatibilities)")
-print(" App uses bfloat16 + VAE tiling + cache clearing for optimization")
+# Enable torch.compile optimizations (timestep_embedding fixed!)
+# Now works with fullgraph=False for compatibility with SAG
+# Skip on MPS (MacBooks) - torch.compile with MPS can cause issues
+if not torch.backends.mps.is_available():
+    _apply_torch_compile_optimizations()
+else:
+    print(
+        "ℹ️ torch.compile skipped on MPS (MacBook) - using fp32 optimizations instead"
+    )
 
 
 @spaces.GPU(duration=60)
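Note: this guard replaces the previous blanket "compile disabled" block: on CUDA machines the compile path now runs, while Apple-silicon Macs skip it because, per the comment in the diff, torch.compile with MPS can cause issues. The same pattern in isolation, as a hedged sketch (the helper name is hypothetical):

```python
# Hypothetical helper mirroring the guard above: compile where inductor is
# known to behave, fall back to eager on MPS (Apple silicon).
import torch

def maybe_compile(model):
    if torch.backends.mps.is_available():
        print("torch.compile skipped on MPS")
        return model  # eager fallback, matching the else-branch in the diff
    return torch.compile(model, backend="inductor", mode="reduce-overhead")
```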